Message Schedule
SHA1MSG1
,
SHA1MSG2
,
SHA1NEXTE
and
SHA1RNDS4
InstructionsSHA256MSG1
,
SHA256MSG2
and
SHA256RNDS2
InstructionsVSHA512MSG1
,
VSHA512MSG2
and
VSHA512RNDS2
InstructionsAVX2
Instructions for GNU asSHA512
and AVX2
Instructions for MASM
Also see the IETF’s RFCs 3174, 3874, 4634 and 6234.
The operation $\mathrm{ROTR}^{n}(x) = (x \gg n) \lor (x \ll (w - n)) = (x \ll (w - n)) \oplus (x \gg n)$ defined in FIPS Publication 180-4 is implemented as the preprocessor macro
ROTR(x, n)
or
referenced by this name.
The function $\mathrm{Ch}(x,y,z) = (x \land y) \oplus (\lnot x \land z) = (x \land y) \lor (\lnot x \land z)$ is implemented in optimised form as the preprocessor macro
CH(x, y, z)
or referenced by this name.
The function $\mathrm{Maj}(x,y,z) = (x \land y) \oplus (x \land z) \oplus (y \land z) = (x \land y) \lor (x \land z) \lor (y \land z)$ is implemented in optimised form as the preprocessor macro
MAJ(x, y, z)
or referenced by this name.
The functions $\Sigma_0(x)$ and $\Sigma_1(x)$ are implemented as the preprocessor macros
SIGMA_0(x)
and SIGMA_1(x)
or
referenced by these names; the functions $\sigma_0(x)$ and $\sigma_1(x)$ are implemented as
SMALL_0(x)
and SMALL_1(x)
or referenced by
these names.
The interfaces of the
SHA-1,
SHA-256 and
SHA-512 functions
presented below follow those of the
SHA-1 functions
A_SHAFinal()
,
A_SHAInit()
and
A_SHAUpdate()
exported from Windows’
NTDLL.dll
:
Windows NT Cryptographic Providers
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// Context structure passed to A_SHAInit(), A_SHAUpdate() and A_SHAFinal().
// The interface is undocumented, hence the deprecation warning attached to
// the type. The trailing comments on State and Count presumably show the
// contents after A_SHAInit() -- TODO confirm against NTDLL.dll.
__declspec(deprecated("undocumented interface, use at your own risk"))
typedef struct _A_SHA_CTX
{
DWORD Block[16];
DWORD State[5]; // {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
DWORD Count[2]; // {0, 0}
} A_SHA_CTX;
// Imported SHA-1 routines; the SHA-256 functions in this document follow
// the same init / update / final calling pattern.

// Writes the 20-byte digest for the data fed into Context so far.
// NOTE(review): whether Context is reset afterwards is not visible here.
__declspec(dllimport)
VOID NTAPI A_SHAFinal(A_SHA_CTX *Context, BYTE Digest[20]);
// Prepares Context for a new message.
__declspec(dllimport)
VOID NTAPI A_SHAInit(A_SHA_CTX *Context);
// Feeds BufferSize bytes starting at Buffer into the running hash.
__declspec(dllimport)
VOID NTAPI A_SHAUpdate(A_SHA_CTX *Context, LPCVOID Buffer, DWORD BufferSize);
On December 15, 2022, NIST finally announced NIST Transitioning Away from SHA-1 for All Applications and NIST Retires SHA-1 Cryptographic Algorithm.
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// * The software is provided "as is" without any warranty, neither express
// nor implied.
// * In no event will the author be held liable for any damage(s) arising
// from the use of the software.
// * Redistribution of the software is allowed only in unmodified form.
// * Permission is granted to use the software solely for personal private
// and non-commercial purposes.
// * An individuals use of the software in his or her capacity or function
// as an agent, (independent) contractor, employee, member or officer of
// a business, corporation or organization (commercial or non-commercial)
// does not qualify as personal private and non-commercial purpose.
// * Without written approval from the author the software must not be used
// for a business, for commercial, corporate, governmental, military or
// organizational purposes of any kind, or in a commercial, corporate,
// governmental, military or organizational environment of any kind.
// Bit-manipulation helpers for SHA-256 on 32-bit unsigned words.
//
// Every argument and every expansion is fully parenthesised, so the macros
// can be combined with any operator and called with compound expressions.
// Arguments are evaluated more than once and must be free of side effects.
//
// ROTR(m, n)   rotate m right by n bits; requires 0 < n < 32
// SMALL_0/1    "small sigma" functions used to expand the message schedule
// SIGMA_0/1    "big sigma" functions used in every round
// CH, MAJ      "choose" and "majority" functions used in every round
#define ROTR(m, n) (((m) << (32 - (n))) ^ ((m) >> (n)))
#define SMALL_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define SMALL_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))
#define SIGMA_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define SIGMA_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#if 0
// textbook definitions, kept for reference
#define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#else
// equivalent forms that need one operation less each
#define CH(x, y, z) (((x) & ((y) ^ (z))) ^ (z))
#define MAJ(x, y, z) (((x) & (y)) ^ (((x) ^ (y)) & (z)))
#endif
// Hashing context shared by sha256_init(), sha256_update(), sha256_final()
// and sha256_core().
typedef struct _sha256_ctx {
    unsigned int state[8];  // chaining value, updated once per 64-byte block
    unsigned int count[2];  // message length in bytes: [0] = low, [1] = high word
    unsigned int block[16]; // buffer collecting the current 64-byte block
} sha256_ctx;
// Round constants k[0] to k[63] (K0 to K63 of FIPS 180-4); sha256_core()
// adds k[t] to the input of round t.
static const unsigned int k[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1,
0x923F82A4, 0xAB1C5ED5, 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, 0xE49B69C1, 0xEFBE4786,
0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147,
0x06CA6351, 0x14292967, 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, 0xA2BFE8A1, 0xA81A664B,
0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A,
0x5B9CCA4F, 0x682E6FF3, 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#include <arpa/inet.h> // for htonl() and ntohl() functions
// Compression function: mixes the 64-byte block held in context->block
// into context->state. The block itself is left untouched.
void sha256_core(sha256_ctx *context)
{
    unsigned int schedule[64];
    unsigned int va, vb, vc, vd, ve, vf, vg, vh;
    unsigned int t1, t2, round;

    // the first 16 schedule words are the block, read as big-endian words
    for (round = 0; round < 16; round++)
        schedule[round] = ntohl(context->block[round]);

    // the remaining 48 words are derived from earlier schedule words
    for (; round < 64; round++) {
        unsigned int w15 = schedule[round - 15];
        unsigned int w2 = schedule[round - 2];

        schedule[round] = schedule[round - 16] + SMALL_0(w15)
                        + schedule[round - 7] + SMALL_1(w2);
    }

    // start from the current chaining value
    va = context->state[0];
    vb = context->state[1];
    vc = context->state[2];
    vd = context->state[3];
    ve = context->state[4];
    vf = context->state[5];
    vg = context->state[6];
    vh = context->state[7];

    // 64 rounds; all arithmetic wraps modulo 2^32
    for (round = 0; round < 64; round++) {
        t1 = SIGMA_1(ve) + CH(ve, vf, vg) + vh + k[round] + schedule[round];
        t2 = SIGMA_0(va) + MAJ(va, vb, vc);
        vh = vg;
        vg = vf;
        vf = ve;
        ve = vd + t1;
        vd = vc;
        vc = vb;
        vb = va;
        va = t1 + t2;
    }

    // feed the result forward into the chaining value
    context->state[0] += va;
    context->state[1] += vb;
    context->state[2] += vc;
    context->state[3] += vd;
    context->state[4] += ve;
    context->state[5] += vf;
    context->state[6] += vg;
    context->state[7] += vh;
}
#include <string.h> // for memcpy() and memset() functions
// Finishes the hash: pads the buffered data, appends the message length in
// bits, processes the final block(s) and writes the 32-byte digest.
// Side effect: context->state is left converted to network byte order.
void sha256_final(sha256_ctx *context, unsigned char digest[32])
{
    unsigned char *bytes = (unsigned char *) context->block;
    unsigned int used = context->count[0] % 64;
    unsigned int word;

    bytes[used++] = 0x80; // mandatory padding marker 0b10000000

    if (used <= 56)
        // the 8-byte length still fits: zero up to the length field
        memset(bytes + used, 0, 56 - used);
    else {
        // no room for the length: finish this block and start another
        memset(bytes + used, 0, 64 - used);
        sha256_core(context);
        memset(bytes, 0, 56);
    }

    // message length in bits as a 64-bit big-endian number
    context->block[14] = htonl(context->count[1] << 3
                             | context->count[0] >> 29);
    context->block[15] = htonl(context->count[0] << 3);
    sha256_core(context);

    // the digest is the state with every word in network byte order
    for (word = 0; word < 8; word++)
        context->state[word] = htonl(context->state[word]);
    memcpy(digest, context->state, 32);
}
// Resets the context for a new message: zero length, initial hash value.
void sha256_init(sha256_ctx *context)
{
    static const unsigned int initial[8] = {
        0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
        0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
    };
    unsigned int i;

    for (i = 0; i < 8; i++)
        context->state[i] = initial[i];
    context->count[0] = 0;
    context->count[1] = 0;
}
// Feeds `size` bytes at `data` into the running hash.
//
// Bytes are collected in context->block; every time 64 bytes are complete
// the block is processed by sha256_core(). Leftover bytes stay buffered for
// the next call or for sha256_final().
//
// Fix: after a full block has been processed the write position is reset to
// the start of the buffer. It used to stay at the offset found on entry, so
// a call that began with a partly filled buffer corrupted the following
// blocks and could write past the end of the context.
void sha256_update(sha256_ctx *context, unsigned char const *data, unsigned int size)
{
    unsigned int used = context->count[0] & 63; // bytes already buffered
    unsigned int room = 64 - used;              // bytes missing for a full block
    unsigned char *dest = (unsigned char *) context->block + used;

    // update the 64-bit message length, carrying into the high word
    context->count[0] += size;
    if (context->count[0] < size)
        context->count[1]++;

    while (size >= room) {
        memcpy(dest, data, room);
        sha256_core(context); // process full block
        data += room;
        size -= room;
        dest = (unsigned char *) context->block; // next block starts empty
        room = 64;
    }
    if (size > 0)
        memcpy(dest, data, size); // keep the remainder buffered
}
#include <stdio.h>
#include <time.h>
// Hashes the `size` bytes at `data` with a freshly initialised *context and
// prints three lines: the label, the expected digest and the computed digest.
static void sha256_check(sha256_ctx *context, char const *label,
                         char const *expected, void const *data, unsigned int size)
{
    unsigned char digest[32];
    unsigned int word[8], i;

    sha256_init(context);
    if (size > 0)
        sha256_update(context, data, size);
    sha256_final(context, digest);

    // assemble the digest bytes (most significant byte first) into 8 words
    for (i = 0; i < 8; i++)
        word[i] = (unsigned int) digest[4 * i] << 24
                | (unsigned int) digest[4 * i + 1] << 16
                | (unsigned int) digest[4 * i + 2] << 8
                | (unsigned int) digest[4 * i + 3];

    printf("%s\n"
           "\t%s\n"
           "\t%08X %08X %08X %08X %08X %08X %08X %08X\n",
           label, expected,
           word[0], word[1], word[2], word[3],
           word[4], word[5], word[6], word[7]);
}

// Self-test and benchmark: prints the expected and the computed digest for a
// set of test messages, then times sha256_core() over 1 GiB worth of blocks.
int main(void)
{
    static unsigned char million[1000000]; // static: keeps 1 MB off the stack
    unsigned int n = 1024 * 1024 * 1024 / 64; // number of 64-byte blocks in 1 GiB
    clock_t t;
    sha256_ctx context;

    sha256_check(&context, "\"\"",
                 "e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855",
                 "", 0);
    sha256_check(&context, "\"abc\"",
                 "ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad",
                 "abc", 3);
    sha256_check(&context, "\"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq\"",
                 "248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1",
                 "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
                 sizeof("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") - 1);
    sha256_check(&context, "\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"",
                 "cf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1",
                 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                 sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
    sha256_check(&context, "\"\\xBD\"",
                 "68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b",
                 "\xBD", 1);
    sha256_check(&context, "\"\\xC9\\x8C\\x8E\\x55\"",
                 "7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504",
                 "\xC9\x8C\x8E\x55", 4);

    memset(million, 0, 1000);
    sha256_check(&context, "'\\0'*55",
                 "02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7",
                 million, 55);
    sha256_check(&context, "'\\0'*56",
                 "d4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb",
                 million, 56);
    sha256_check(&context, "'\\0'*57",
                 "65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785",
                 million, 57);
    sha256_check(&context, "'\\0'*64",
                 "f5a5fd42 d16a2030 2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b",
                 million, 64);
    sha256_check(&context, "'\\0'*1000",
                 "541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53",
                 million, 1000);

    memset(million, 'A', 1000);
    sha256_check(&context, "'A'*1000",
                 "c2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4",
                 million, 1000);

    memset(million, 'U', 1005);
    sha256_check(&context, "'U'*1005",
                 "f4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0",
                 million, 1005);

    memset(million, 'a', 1000000);
    sha256_check(&context, "'a'*1000000",
                 "cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0",
                 million, 1000000);

    memset(million, 0, 1000000);
    sha256_check(&context, "'\\0'*1000000",
                 "d29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025",
                 million, 1000000);

    // benchmark: the block contents do not matter, only the number of blocks
    t = clock();
    do sha256_core(&context); while (--n);
    t = clock() - t;
    // clock_t is implementation-defined, so cast to match the %lu conversions
    printf("%lu.%06lu seconds per GiB\n",
           (unsigned long) (t / CLOCKS_PER_SEC),
           (unsigned long) ((t % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC));
    return 0;
}
Execution of this program on Matt Godbolt’s
compiler explorer
using GCC 13.2.0
x86-64 yields the following output:
[…] "" e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855 E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855 "abc" ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 248D6A61 D20638B8 E5C02693 0C3E6039 A33CE459 64FF2167 F6ECEDD4 19DB06C1 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" cf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1 CF5B16A7 78AF8380 036CE59E 7B049237 0B249B11 E8F07A51 AFAC4503 7AFEE9D1 "\xBD" 68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b 68325720 AABD7C82 F30F554B 313D0570 C95ACCBB 7DC4B5AA E11204C0 8FFE732B "\xC9\x8C\x8E\x55" 7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504 7ABC22C0 AE5AF26C E93DBB94 433A0E0B 2E119D01 4F8E7F65 BD56C61C CCCD9504 '\0'*55 02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7 02779466 CDEC1638 11D07881 5C633F21 90141308 1449002F 24AA3E80 F0B88EF7 '\0'*56 d4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb D4817AA5 497628E7 C77E6B60 6107042B BBA31308 88C5F47A 375E6179 BE789FBB '\0'*57 65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785 65A16CB7 861335D5 ACE3C607 18B5052E 44660726 DA4CD13B B745381B 235A1785 '\0'*64 f5a5fd42 d16a2030 2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b F5A5FD42 D16A2030 2798EF6E D309979B 43003D23 20D9F0E8 EA9831A9 2759FB4B '\0'*1000 541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53 541B3E9D AA09B20B F85FA273 E5CBD3E8 0185AA4E C298E765 DB87742B 70138A53 'A'*1000 c2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4 C2E68682 3489CED2 017F6059 B8B23931 8B6364F6 DCD835D0 A519105A 1EADD6E4 
'U'*1005 f4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0 F4D62DDE C0F3DD90 EA1380FA 16A5FF8D C4C54B21 740650F2 4AFC4120 903552B0 'a'*1000000 cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0 '\0'*1000000 d29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025 D29751F2 649B32FF 572B5E0A 9F541EA6 60A50F94 FF0BEEDF B0B692B9 24CC8025 4.875124 seconds per GiB
In other units: 220.2 MB per second.
The following variant of sha256_core()
avoids the inefficient rotation of the working variables
a to h via a (partially) unrolled third loop:
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Compression function, variant with the round loop unrolled 8 times.
//
// Instead of moving the values of a to h from variable to variable after
// every round, each of the 8 steps in the loop body shifts the roles of the
// variables by one position: the variable that played "h" receives the new
// "a", the one that played "d" receives the new "e". After 8 steps every
// variable has its original role again, so the loop runs 64 / 8 = 8 times.
// The statement order inside each step is essential.
void sha256_core(sha256_ctx *context)
{
unsigned int a, b, c, d, e, f, g, h, t, w[64];
t = 0;
do // load block into "message schedule" w
w[t] = ntohl(context->block[t]);
while (++t < 16);
do // expand "message schedule"
w[t] = w[t - 16] + SMALL_0(w[t - 15]) + w[t - 7] + SMALL_1(w[t - 2]);
while (++t < 64);
// load state into "working variables" a to h
a = context->state[0];
b = context->state[1];
c = context->state[2];
d = context->state[3];
e = context->state[4];
f = context->state[5];
g = context->state[6];
h = context->state[7];
t = 0;
do { // scramble "working variables"
// one step = one round: the first statement accumulates T1 onto the old
// "h", the second adds it to "d" (new "e"), the third adds T2 (new "a")
h += SIGMA_1(e) + CH(e, f, g) + k[t] + w[t];
d += h;
h += SIGMA_0(a) + MAJ(a, b, c);
++t;
g += SIGMA_1(d) + CH(d, e, f) + k[t] + w[t];
c += g;
g += SIGMA_0(h) + MAJ(h, a, b);
++t;
f += SIGMA_1(c) + CH(c, d, e) + k[t] + w[t];
b += f;
f += SIGMA_0(g) + MAJ(g, h, a);
++t;
e += SIGMA_1(b) + CH(b, c, d) + k[t] + w[t];
a += e;
e += SIGMA_0(f) + MAJ(f, g, h);
++t;
d += SIGMA_1(a) + CH(a, b, c) + k[t] + w[t];
h += d;
d += SIGMA_0(e) + MAJ(e, f, g);
++t;
c += SIGMA_1(h) + CH(h, a, b) + k[t] + w[t];
g += c;
c += SIGMA_0(d) + MAJ(d, e, f);
++t;
b += SIGMA_1(g) + CH(g, h, a) + k[t] + w[t];
f += b;
b += SIGMA_0(c) + MAJ(c, d, e);
++t;
a += SIGMA_1(f) + CH(f, g, h) + k[t] + w[t];
e += a;
a += SIGMA_0(b) + MAJ(b, c, d);
} while (++t < 64);
// add "working variables" to state
context->state[0] += a;
context->state[1] += b;
context->state[2] += c;
context->state[3] += d;
context->state[4] += e;
context->state[5] += f;
context->state[6] += g;
context->state[7] += h;
}
Note: depending on compiler and processor, this
variant might be faster than the straightforward implementation, but
can also be slower!
Message Schedule
The following variant of sha256_core()
coalesces the 3 loops into a single one to remove the dedicated
array w[64]
for the message schedule W0 to W63 and saves 256 bytes (4 cache lines) by folding it onto the array
block[16]
of the sha256_ctx
structure:
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Compression function, variant with a single loop and no separate message
// schedule: context->block doubles as a ring of the 16 most recent schedule
// words, so the block contents are overwritten by this function.
void sha256_core(sha256_ctx *context)
{
    unsigned int *ring = context->block;
    unsigned int va, vb, vc, vd, ve, vf, vg, vh;
    unsigned int t1, t2, round;

    // start from the current chaining value
    va = context->state[0];
    vb = context->state[1];
    vc = context->state[2];
    vd = context->state[3];
    ve = context->state[4];
    vf = context->state[5];
    vg = context->state[6];
    vh = context->state[7];

    for (round = 0; round < 64; round++) {
        unsigned int slot = round & 15;

        if (round < 16)
            // first pass over the ring: convert the block word in place
            ring[slot] = ntohl(ring[slot]);
        else {
            // ring[slot] still holds word (round - 16); turn it into word round
            unsigned int w15 = ring[(round - 15) & 15];
            unsigned int w2 = ring[(round - 2) & 15];

            ring[slot] += SMALL_0(w15) + ring[(round - 7) & 15] + SMALL_1(w2);
        }

        t2 = SIGMA_0(va) + MAJ(va, vb, vc);
        t1 = SIGMA_1(ve) + CH(ve, vf, vg) + vh + k[round] + ring[slot];
        vh = vg;
        vg = vf;
        vf = ve;
        ve = vd + t1;
        vd = vc;
        vc = vb;
        vb = va;
        va = t1 + t2;
    }

    // feed the result forward into the chaining value
    context->state[0] += va;
    context->state[1] += vb;
    context->state[2] += vc;
    context->state[3] += vd;
    context->state[4] += ve;
    context->state[5] += vf;
    context->state[6] += vg;
    context->state[7] += vh;
}
Note: combining both variants into one without
loop, i.e. completely unrolled, is left as an exercise to the
reader!
The following i386 assembly implementation for the GNU assembler folds the message schedule W0 to W63 onto the array
block[16]
of the SHA256_CTX
structure,
uses the array state[8]
there instead of the 8 working
variables a to h to perform their rotation via
the identities
a = state[(64 - t) % 8] = state[-t & 7]
,
b = state[(65 - t) % 8] = state['b' - 'a' - t & 7]
etc., unrolls the coalesced loops and incorporates the constants
k0 to k63 as immediate
values into the instructions:
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
# Common "cdecl" calling and naming convention for i386 platform:
# - arguments are pushed on stack in reverse order (from right to left),
# 4-byte aligned;
# - 64-bit integer arguments are passed as pair of 32-bit integer arguments,
# low part below high part;
# - 64-bit integer result is returned in registers EAX (low part) and
# EDX (high part);
# - 32-bit integer or pointer result is returned in register EAX;
# - registers EAX, ECX, EDX and XMM0 to XMM7 are volatile and can be
# clobbered;
# - registers EBX, ESP, EBP, ESI and EDI must be preserved;
# - function names are prefixed with an underscore.
.ident "Copyright (C) 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>"
.file "sha-256.s"
.arch i486 # support BSWAP instruction
.code32
.att_syntax
# Layout of the context structure: the labels below are byte offsets from
# the start of the structure (state = 0, count = 32, block = 40) and are
# used as displacements, e.g. block(%edx) or state+4(%ebp).
.struct 0 # SHA256_CTX structure
state: # 8 double words
.space 8*4
count: # 1 quad word
.space 2*4
block: # 16 double words
.space 16*4
# One SHA-256 round.
#   k = round constant, used as immediate displacement
#   t = round number 0 to 63, must be an assembly-time constant
# Expects EBP = address of the context structure; clobbers EAX, EBX, ECX
# and EDX. The working variables a to h are not moved between rounds:
# in round t they live in state[(64-t)&7] to state[(71-t)&7], so only the
# slots of "d" and "h" are written. block[16] is used as a ring buffer for
# the 16 most recent message schedule words.
.macro sha256 k :req, t :req
.if \t < 16
movl block+4*\t(%ebp), %eax
bswapl %eax # eax = ntohl(block[t])
.else
movl block+4*((\t-15)&15)(%ebp), %eax
movl block+4*((\t-2)&15)(%ebp), %ebx
movl %eax, %ecx # eax = m = block[t-15&15]
movl %ebx, %edx # ebx = n = block[t-2&15]
shrl $3, %eax # eax = m >> 3
shrl $10, %ebx # ebx = n >> 10
rorl $7, %ecx # ecx = ROTR(m, 7)
rorl $17, %edx # edx = ROTR(n, 17)
xorl %ecx, %eax # eax = (m >> 3) ^ ROTR(m, 7)
xorl %edx, %ebx # ebx = (n >> 10) ^ ROTR(n, 17)
# rotating the already rotated value by the difference yields the second
# rotation without reloading m or n
rorl $(18-7), %ecx # ecx = ROTR(m, 18)
rorl $(19-17), %edx # edx = ROTR(n, 19)
xorl %ecx, %eax # eax = (m >> 3) ^ ROTR(m, 7) ^ ROTR(m, 18)
# = SMALL_0(m)
xorl %edx, %ebx # ebx = (n >> 10) ^ ROTR(n, 17) ^ ROTR(n, 19)
# = SMALL_1(n)
addl block+4*((\t-16)&15)(%ebp), %eax
addl block+4*((\t-7)&15)(%ebp), %ebx
addl %ebx, %eax # eax = SMALL_0(block[t-15&15]) + block[t-16&15]
# + SMALL_1(block[t-2&15]) + block[t-7&15]
.endif
movl %eax, block+4*((\t)&15)(%ebp)
# block[t&15] = (t < 16)
# ? ntohl(block[t])
# : SMALL_0(block[t-15&15]) + block[t-16&15]
# + SMALL_1(block[t-2&15]) + block[t-7&15]
addl state+4*((71-\t)&7)(%ebp), %eax
# eax = block[t&15] + state[71-t&7]
# = block[t&15] + h
movl state+4*((70-\t)&7)(%ebp), %ebx
# ebx = g = state[70-t&7]
movl state+4*((69-\t)&7)(%ebp), %ecx
# ecx = f = state[69-t&7]
movl state+4*((68-\t)&7)(%ebp), %edx
# edx = e = state[68-t&7]
xorl %ebx, %ecx # ecx = f ^ g
andl %edx, %ecx # ecx = e & (f ^ g)
xorl %ebx, %ecx # ecx = e & (f ^ g) ^ g
# = CH(e, f, g)
addl %ecx, %eax # eax = block[t&15] + CH(e, f, g) + h
movl %edx, %ebx # ebx = e
rorl $6, %edx # edx = ROTR(e, 6)
rorl $11, %ebx # ebx = ROTR(e, 11)
xorl %ebx, %edx # edx = ROTR(e, 6) ^ ROTR(e, 11)
rorl $(25-11), %ebx # ebx = ROTR(e, 25)
xorl %edx, %ebx # ebx = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25)
# = SIGMA_1(e)
# LEA adds both registers and the round constant in a single instruction
leal \k(%eax, %ebx), %eax # eax = SIGMA_1(e) + CH(e, f, g) + h + k[t] + block[t&15]
# = T1
movl state+4*((64-\t)&7)(%ebp), %ebx
# ebx = a = state[64-t&7]
movl state+4*((65-\t)&7)(%ebp), %ecx
# ecx = b
movl %ebx, %edx # edx = a
xorl %ecx, %ebx # ebx = a ^ b
andl %edx, %ecx # ecx = a & b
andl state+4*((66-\t)&7)(%ebp), %ebx
# ebx = (a ^ b) & c
addl %eax, state+4*((67-\t)&7)(%ebp)
# d' = d + T1
orl %ecx, %ebx # ebx = (a & b) | ((a ^ b) & c)
# = MAJ(a, b, c)
addl %ebx, %eax # eax = T1 + MAJ(a, b, c)
movl %edx, %ecx # ecx = a
rorl $2, %edx # edx = ROTR(a, 2)
rorl $13, %ecx # ecx = ROTR(a, 13)
xorl %ecx, %edx # edx = ROTR(a, 2) ^ ROTR(a, 13)
rorl $(22-13), %ecx # ecx = ROTR(a, 22)
xorl %ecx, %edx # edx = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22)
# = SIGMA_0(a)
addl %edx, %eax # eax = T1 + T2
# the slot of the old "h" (state[71-t&7] = state[63-t&7]) becomes the new "a"
movl %eax, state+4*((63-\t)&7)(%ebp)
# h' = T1 + T2
.endm
.text
# Processes the 64-byte block stored in the context and updates its state.
# The old state is saved on the stack, the 64 rounds run directly on
# state[8] (see macro sha256), and the saved values are added back at the
# end. block[16] is overwritten with message schedule words.
# Uses EAX, ECX, EDX (volatile) plus EBX and EBP, which are saved and
# restored; ESI and EDI are not touched.
sha256_core: # void SHA256_Core(SHA256_CTX *context)
pushl %ebp
movl 8(%esp), %ebp # ebp = address of context structure
pushl %ebx
# save state
pushl state(%ebp)
pushl state+4(%ebp)
pushl state+8(%ebp)
pushl state+12(%ebp)
pushl state+16(%ebp)
pushl state+20(%ebp)
pushl state+24(%ebp)
pushl state+28(%ebp)
# calculate 64 rounds
sha256 0x428A2F98, 0
sha256 0x71374491, 1
sha256 0xB5C0FBCF, 2
sha256 0xE9B5DBA5, 3
sha256 0x3956C25B, 4
sha256 0x59F111F1, 5
sha256 0x923F82A4, 6
sha256 0xAB1C5ED5, 7
sha256 0xD807AA98, 8
sha256 0x12835B01, 9
sha256 0x243185BE, 10
sha256 0x550C7DC3, 11
sha256 0x72BE5D74, 12
sha256 0x80DEB1FE, 13
sha256 0x9BDC06A7, 14
sha256 0xC19BF174, 15
sha256 0xE49B69C1, 16
sha256 0xEFBE4786, 17
sha256 0x0FC19DC6, 18
sha256 0x240CA1CC, 19
sha256 0x2DE92C6F, 20
sha256 0x4A7484AA, 21
sha256 0x5CB0A9DC, 22
sha256 0x76F988DA, 23
sha256 0x983E5152, 24
sha256 0xA831C66D, 25
sha256 0xB00327C8, 26
sha256 0xBF597FC7, 27
sha256 0xC6E00BF3, 28
sha256 0xD5A79147, 29
sha256 0x06CA6351, 30
sha256 0x14292967, 31
sha256 0x27B70A85, 32
sha256 0x2E1B2138, 33
sha256 0x4D2C6DFC, 34
sha256 0x53380D13, 35
sha256 0x650A7354, 36
sha256 0x766A0ABB, 37
sha256 0x81C2C92E, 38
sha256 0x92722C85, 39
sha256 0xA2BFE8A1, 40
sha256 0xA81A664B, 41
sha256 0xC24B8B70, 42
sha256 0xC76C51A3, 43
sha256 0xD192E819, 44
sha256 0xD6990624, 45
sha256 0xF40E3585, 46
sha256 0x106AA070, 47
sha256 0x19A4C116, 48
sha256 0x1E376C08, 49
sha256 0x2748774C, 50
sha256 0x34B0BCB5, 51
sha256 0x391C0CB3, 52
sha256 0x4ED8AA4A, 53
sha256 0x5B9CCA4F, 54
sha256 0x682E6FF3, 55
sha256 0x748F82EE, 56
sha256 0x78A5636F, 57
sha256 0x84C87814, 58
sha256 0x8CC70208, 59
sha256 0x90BEFFFA, 60
sha256 0xA4506CEB, 61
sha256 0xBEF9A3F7, 62
sha256 0xC67178F2, 63
# add saved state
# (64 is a multiple of 8, so a to h are back in state[0] to state[7];
# the values pop in reverse order, starting with the saved state[7])
popl %eax
addl %eax, state+28(%ebp)
popl %ebx
addl %ebx, state+24(%ebp)
popl %ecx
addl %ecx, state+20(%ebp)
popl %edx
addl %edx, state+16(%ebp)
popl %eax
addl %eax, state+12(%ebp)
popl %ebx
addl %ebx, state+8(%ebp)
popl %ecx
addl %ecx, state+4(%ebp)
popl %edx
addl %edx, state(%ebp)
popl %ebx
popl %ebp
retl
.global sha256_core
.size sha256_core, .-sha256_core
.type sha256_core, @function
# Pads the buffered data, appends the message length in bits, processes the
# final block(s) and writes the 32-byte digest, most significant byte of
# each state word first. Unlike the C implementation, state[8] itself is
# not byte-swapped. The string instructions rely on the direction flag
# being clear.
sha256_final: # void SHA256_Final(SHA256_CTX *context,
# unsigned char digest[32])
movl 4(%esp), %edx # edx = address of context structure
movl count(%edx), %ecx # ecx = low double word of count
andl $63, %ecx # ecx = number of bytes in block
# = index of first free byte in block
pushl %edi
leal block(%edx, %ecx), %edi # edi = address of first free byte in block
pushl %edi
.Lpad_1:
movb $0b10000000, %al
stosb # block[index] = 0b10000000
.Lpad_0:
xorl %eax, %eax # eax = 0
xorl $63, %ecx # ecx = number of free bytes in block - 1
# = 63 - index
rep stosb # block[index + 1, 63] = 0
subl $8, %edi # edi = address of last quad word in block
popl %eax # eax = address of first free byte in block
cmpl %eax, %edi
ja .Lpad_count # space for count available in block?
# index < 56?
# no space: process the padded block, then build a second block
# consisting of zeroes and the count
pushl %edx
calll sha256_core
popl %edx
.Lpad_block:
movl %edi, %ecx # ecx = address of last quad word in block
leal block(%edx), %edi # edi = address of block
xorl %eax, %eax # eax = 0
subl %edi, %ecx # ecx = number of bytes before last quad word
# = 56
rep stosb # block[0, 55] = 0,
# edi = address of last quad word in block
.Lpad_count:
movl count(%edx), %ecx
movl count+4(%edx), %eax # eax:ecx = count
shldl $3, %ecx, %eax
shll $3, %ecx # eax:ecx = count * 8
# = number of message bits
bswapl %eax
bswapl %ecx # eax:ecx = htonll(number of message bits)
stosl
movl %ecx, %eax
stosl # block[56, 63] = htonll(number of message bits)
pushl %edx
calll sha256_core
popl %edx
# ESI must be preserved: park its value in EDX while ESI is the source
xchgl %edx, %esi # esi = address of state
movl 12(%esp), %edi # edi = address of digest
movl $8, %ecx # ecx = number of double words
.Ldigest:
lodsl
.ifdef ALIGNED
bswapl %eax
stosl
.else
# store byte by byte: each rotation brings the next most significant
# byte into AL
roll $8, %eax
stosb
roll $8, %eax
stosb
roll $8, %eax
stosb
roll $8, %eax
stosb
.endif
decl %ecx
jnz .Ldigest
movl %edx, %esi # restore esi
popl %edi
retl
.global sha256_final
.size sha256_final, .-sha256_final
.type sha256_final, @function
# Initialises the context: state[0..7] = initial hash values H0 to H7,
# count = 0. block[16] is left untouched.
# EDI must be preserved, so its value is parked in EDX (volatile) while EDI
# serves as destination pointer for STOSL; relies on the direction flag
# being clear.
# Fix: the constants carry the '$' prefix now. Without it, AT&T syntax
# treats the number as a memory address and the MOVL reads from that
# address instead of loading the constant.
sha256_init: # void SHA256_Init(SHA256_CTX *context)
movl %edi, %edx # edx = saved edi
movl 4(%esp), %edi # edi = address of context structure
movl $0x6A09E667, %eax # eax = H0
stosl # state[0] = H0
movl $0xBB67AE85, %eax # eax = H1
stosl # state[1] = H1
movl $0x3C6EF372, %eax # eax = H2
stosl # state[2] = H2
movl $0xA54FF53A, %eax # eax = H3
stosl # state[3] = H3
movl $0x510E527F, %eax # eax = H4
stosl # state[4] = H4
movl $0x9B05688C, %eax # eax = H5
stosl # state[5] = H5
movl $0x1F83D9AB, %eax # eax = H6
stosl # state[6] = H6
movl $0x5BE0CD19, %eax # eax = H7
stosl # state[7] = H7
xorl %eax, %eax # eax = 0
stosl
stosl # count = 0
movl %edx, %edi # restore edi
retl
.global sha256_init
.size sha256_init, .-sha256_init
.type sha256_init, @function
# Adds the data to the running hash: bytes are copied into block[] and
# sha256_core is called whenever 64 bytes are complete; leftover bytes stay
# buffered. Returns immediately when size is 0.
# ESI and EDI are saved and restored; relies on the direction flag being
# clear.
sha256_update: # void SHA256_Update(SHA256_CTX *context,
# void const *data,
# unsigned int size)
movl 12(%esp), %eax # eax = number of bytes in data
testl %eax, %eax
jz .Lnone # no data?
movl 4(%esp), %edx # edx = address of context structure
movl count(%edx), %ecx # ecx = low double word of count
andl $63, %ecx # ecx = number of bytes in block
# = index of first free byte in block
addl %eax, count(%edx)
adcl $0, count+4(%edx) # count += number of bytes in data
pushl %esi
movl 12(%esp), %esi # esi = address of data
pushl %edi
.Ldata:
leal block(%edx, %ecx), %edi # edi = address of first free byte in block
xorl $63, %ecx
incl %ecx # ecx = number of free bytes in block
subl %ecx, %eax # eax = number of bytes in data
# - number of free bytes in block
jb .Llast # number of bytes in data < number of free bytes in block?
.Lmore:
rep movsb # esi = address of remaining data
# EDI is free after the copy and not touched by sha256_core, so it
# carries the remaining byte count across the call
movl %eax, %edi
pushl %edx
calll sha256_core
popl %edx
movl %edi, %eax
xorl %ecx, %ecx # ecx = 0 = index of first free byte in block
testl %eax, %eax
jnz .Ldata # more data?
popl %edi
popl %esi
.Lnone:
retl
.Llast:
# EAX is negative here (bytes in data - free bytes), so the sum is the
# number of remaining bytes in data
addl %eax, %ecx # ecx = number of bytes in data
rep movsb
popl %edi
popl %esi
retl
.global sha256_update
.size sha256_update, .-sha256_update
.type sha256_update, @function
.end
#include
the following
ANSI C
header file in your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#ifndef SHA256_H_INCLUDED
#define SHA256_H_INCLUDED

// Hashing context; the member order and sizes must match the structure
// layout used by the assembly routines (state, count, block).
typedef struct _sha256_ctx {
    unsigned int state[8], count[2], block[16];
} sha256_ctx;

// Processes the 64-byte block stored in context->block and updates
// context->state; the contents of context->block are overwritten.
extern void sha256_core(sha256_ctx *context);
// Pads the buffered data, processes the final block(s) and writes the
// 32-byte message digest.
extern void sha256_final(sha256_ctx *context, unsigned char digest[32]);
// Sets context->state to the initial hash value and context->count to 0.
extern void sha256_init(sha256_ctx *context);
// Adds `size` bytes at `data` to the running hash.
extern void sha256_update(sha256_ctx *context, void const *data, unsigned int size);

#endif // SHA256_H_INCLUDED
; Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; Assembler setup for the 32-bit build: flat memory model with C naming
; and calling convention.
.486 ; support BSWAP instruction
.model flat, C
; Short internal names for the four public procedures; the text macros
; expand to the exported symbol names.
core textequ <SHA256_Core>
final textequ <SHA256_Final>
init textequ <SHA256_Init>
update textequ <SHA256_Update>
; Layout of the hash context: 8 state dwords, a 2-dword byte count
; (low dword first, see the ADD/ADC pair in update) and the 64-byte block.
context struct 4 ; SHA256_CTX structure
state dword 8 dup (?) ; also "working variables" a to h
count dword 2 dup (?)
block dword 16 dup (?) ; also "message schedule" W'[16]
context ends
; in order to fold the message schedule W[64] to W'[16] alias block[16],
; W[i] becomes W'[i % 16] alias block[i & 15]
; in order to rotate the 8 working variables a to h through state[8],
; a becomes state[(64 - t) % 8] alias state[-t & 7],
; b becomes state[(64 + 'b' - 'a' - t) % 8] alias state['b' - 'a' - t & 7],
; etc.
; block(l): macro function yielding the memory operand for
; context.block[l mod 16], addressed relative to EBP (which must hold the
; context address). The index is reduced at assembly time.
block macro l :req
exitm @CatStr(<context.block[ebp+(>, &l, <) mod lengthof context.block * 4]>)
endm
; state(l): macro function yielding the memory operand for
; context.state[l mod 8], addressed relative to EBP.
state macro l :req
exitm @CatStr(<context.state[ebp+(>, &l, <) mod lengthof context.state * 4]>)
endm
; sha256 k, t: emit the code for one SHA-256 round.
;   k = round constant K[t] (must be a constant expression)
;   t = round number, 0 to 63 (must be a constant expression)
; Expects EBP = address of the context structure; clobbers EAX, EBX, ECX
; and EDX. For t < 16 the message word is byte-swapped in place, for
; t >= 16 it is derived from earlier words and stored back into block[],
; so the input block is overwritten. The working variables are not moved
; between rounds; instead their indices into state[] depend on t.
sha256 macro k :req, t :req
;; reject arguments that are not constant expressions
.erre 4 and (opattr &k) and (opattr &t)
;; reject round numbers outside 0..63
.errnz -64 and &t
if &t lt 16
mov eax, block(&t) ;; eax = block[t]
if @Cpu and 16
bswap eax ;; eax = ntohl(block[t])
else
xchg ah, al
ror eax, 16
xchg ah, al
endif
else ; &t ge 16
mov eax, block(&t-15) ;; eax = m = block[t - 15 & 15]
mov ebx, block(&t-2) ;; ebx = n = block[t - 2 & 15]
mov ecx, eax ;; ecx = m
mov edx, ebx ;; edx = n
shr eax, 3 ;; eax = m >> 3
shr ebx, 10 ;; ebx = n >> 10
ror ecx, 7 ;; ecx = ROTR(m, 7)
ror edx, 17 ;; edx = ROTR(n, 17)
xor eax, ecx ;; eax = (m >> 3) ^ ROTR(m, 7)
xor ebx, edx ;; ebx = (n >> 10) ^ ROTR(n, 17)
;; rotate the already rotated values by the difference only
ror ecx, 18-7 ;; ecx = ROTR(m, 18)
ror edx, 19-17 ;; edx = ROTR(n, 19)
xor eax, ecx ;; eax = (m >> 3) ^ ROTR(m, 7) ^ ROTR(m, 18)
;; = SMALL_0(m)
xor ebx, edx ;; ebx = (n >> 10) ^ ROTR(n, 17) ^ ROTR(n, 19)
;; = SMALL_1(n)
add eax, block(&t-16) ;; eax = SMALL_0(m) + block[t - 16 & 15]
add ebx, block(&t-7) ;; ebx = SMALL_1(n) + block[t - 7 & 15]
add eax, ebx ;; eax = SMALL_0(m) + block[t - 16 & 15]
;; + SMALL_1(n) + block[t - 7 & 15]
endif ; &t ge 16
mov block(&t), eax ;; block[t & 15] = (t < 16)
;; ? ntohl(block[t])
;; : SMALL_0(m) + block[t - 16 & 15]
;; + SMALL_1(n) + block[t - 7 & 15]
add eax, state(71-&t) ;; eax = block[t & 15] + state[71 - t & 7]
;; = block[t & 15] + h
mov ebx, state(70-&t) ;; ebx = g = state[70 - t & 7]
mov ecx, state(69-&t) ;; ecx = f = state[69 - t & 7]
mov edx, state(68-&t) ;; edx = e = state[68 - t & 7]
xor ecx, ebx ;; ecx = f ^ g
and ecx, edx ;; ecx = e & (f ^ g)
xor ecx, ebx ;; ecx = e & (f ^ g) ^ g
;; = CH(e, f, g)
add eax, ecx ;; eax = block[t & 15] + CH(e, f, g) + h
mov ebx, edx ;; ebx = e
ror edx, 6 ;; edx = ROTR(e, 6)
ror ebx, 11 ;; ebx = ROTR(e, 11)
xor edx, ebx ;; edx = ROTR(e, 6) ^ ROTR(e, 11)
ror ebx, 25-11 ;; ebx = ROTR(e, 25)
xor ebx, edx ;; ebx = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25)
;; = SIGMA_1(e)
;; LEA adds the two registers and the round constant in one instruction
lea eax, &k[eax+ebx] ;; eax = SIGMA_1(e) + CH(e, f, g) + h + k[t] + block[t & 15]
;; = T1
mov ebx, state(64-&t) ;; ebx = a = state[64 - t & 7]
mov ecx, state(65-&t) ;; ecx = b = state[65 - t & 7]
mov edx, ebx ;; edx = a
xor ebx, ecx ;; ebx = a ^ b
and ecx, edx ;; ecx = a & b
and ebx, state(66-&t) ;; ebx = (a ^ b) & c
add state(67-&t), eax ;; d' = d + T1
or ebx, ecx ;; ebx = (a & b) | ((a ^ b) & c)
;; = MAJ(a, b, c)
add eax, ebx ;; eax = T1 + MAJ(a, b, c)
mov ecx, edx ;; ecx = a
ror edx, 2 ;; edx = ROTR(a, 2)
ror ecx, 13 ;; ecx = ROTR(a, 13)
xor edx, ecx ;; edx = ROTR(a, 2) ^ ROTR(a, 13)
ror ecx, 22-13 ;; ecx = ROTR(a, 22)
xor edx, ecx ;; edx = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22)
;; = SIGMA_0(a)
add eax, edx ;; eax = T1 + T2
;; the slot that held h in this round holds a in the next round
mov state(63-&t), eax ;; state[63 - t & 7] = h' = T1 + T2
endm
.code
; SHA256_Core: process the 64-byte block held in context.block and fold
; the result into context.state.
; The context address is read from the stack. EBP and EBX are saved and
; restored; EAX, ECX and EDX are clobbered. The contents of context.block
; are overwritten by the rounds (see the sha256 macro).
core proc public ; void SHA256_Core(SHA256_CTX *context)
push ebp
; one register was pushed, so the argument is at [esp+8]
mov ebp, [esp+8] ; ebp = address of context structure
assume ebp :ptr context
push ebx
; save state
; the rounds use state[] itself as working variables, so the incoming
; values are kept on the stack until the final addition
push context.state[ebp]
push context.state[ebp+4]
push context.state[ebp+8]
push context.state[ebp+12]
push context.state[ebp+16]
push context.state[ebp+20]
push context.state[ebp+24]
push context.state[ebp+28]
; calculate 64 rounds
sha256 0428A2F98h, 0
sha256 071374491h, 1
sha256 0B5C0FBCFh, 2
sha256 0E9B5DBA5h, 3
sha256 03956C25Bh, 4
sha256 059F111F1h, 5
sha256 0923F82A4h, 6
sha256 0AB1C5ED5h, 7
sha256 0D807AA98h, 8
sha256 012835B01h, 9
sha256 0243185BEh, 10
sha256 0550C7DC3h, 11
sha256 072BE5D74h, 12
sha256 080DEB1FEh, 13
sha256 09BDC06A7h, 14
sha256 0C19BF174h, 15
sha256 0E49B69C1h, 16
sha256 0EFBE4786h, 17
sha256 00FC19DC6h, 18
sha256 0240CA1CCh, 19
sha256 02DE92C6Fh, 20
sha256 04A7484AAh, 21
sha256 05CB0A9DCh, 22
sha256 076F988DAh, 23
sha256 0983E5152h, 24
sha256 0A831C66Dh, 25
sha256 0B00327C8h, 26
sha256 0BF597FC7h, 27
sha256 0C6E00BF3h, 28
sha256 0D5A79147h, 29
sha256 006CA6351h, 30
sha256 014292967h, 31
sha256 027B70A85h, 32
sha256 02E1B2138h, 33
sha256 04D2C6DFCh, 34
sha256 053380D13h, 35
sha256 0650A7354h, 36
sha256 0766A0ABBh, 37
sha256 081C2C92Eh, 38
sha256 092722C85h, 39
sha256 0A2BFE8A1h, 40
sha256 0A81A664Bh, 41
sha256 0C24B8B70h, 42
sha256 0C76C51A3h, 43
sha256 0D192E819h, 44
sha256 0D6990624h, 45
sha256 0F40E3585h, 46
sha256 0106AA070h, 47
sha256 019A4C116h, 48
sha256 01E376C08h, 49
sha256 02748774Ch, 50
sha256 034B0BCB5h, 51
sha256 0391C0CB3h, 52
sha256 04ED8AA4Ah, 53
sha256 05B9CCA4Fh, 54
sha256 0682E6FF3h, 55
sha256 0748F82EEh, 56
sha256 078A5636Fh, 57
sha256 084C87814h, 58
sha256 08CC70208h, 59
sha256 090BEFFFAh, 60
sha256 0A4506CEBh, 61
sha256 0BEF9A3F7h, 62
sha256 0C67178F2h, 63
; add saved state
; values come off the stack in reverse push order: state[7] first
pop eax
add context.state[ebp+28], eax
pop ebx
add context.state[ebp+24], ebx
pop ecx
add context.state[ebp+20], ecx
pop edx
add context.state[ebp+16], edx
pop eax
add context.state[ebp+12], eax
pop ebx
add context.state[ebp+8], ebx
pop ecx
add context.state[ebp+4], ecx
pop edx
add context.state[ebp], edx
pop ebx
pop ebp
ret
core endp
; SHA256_Final: finish the message and write the 32-byte digest.
; Appends the byte 0x80 and zero bytes to the current block, processes an
; extra block when fewer than 8 bytes remain for the length field, stores
; the message length in bits (byte-swapped) in the last 8 bytes of the
; block, runs the core once more and writes the 8 state dwords to 'digest'
; with their bytes in most-significant-first order.
; Arguments are read from the stack; EDI and ESI are preserved.
; The labels pad_1, pad_0 and pad_block are not jump targets within this
; procedure; they only name the steps.
final proc public ; void SHA256_Final(SHA256_CTX *context,
; unsigned char digest[32])
assume edx :ptr context
mov edx, [esp+4] ; edx = address of context structure
mov ecx, context.count[edx] ; ecx = low dword of count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push edi
lea edi, context.block[edx+ecx]
; remember where padding started; compared against block + 56 below
push edi ; edi = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; eax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 63 - index
rep stosb ; block[index + 1, 63] = 0
sub edi, sizeof context.count
; edi = address of last qword in block
pop eax ; eax = address of first free byte in block
cmp eax, edi
jb short pad_count ; space for count available in block?
; index < 56?
; no room for the length: process this block, then build an all-zero one
push edx
call core
pop edx
pad_block:
mov ecx, edi ; ecx = address of last qword in block
lea edi, context.block[edx] ; edi = address of block
xor eax, eax ; eax = 0
sub ecx, edi ; ecx = number of bytes before last qword
; = 56
rep stosb ; block[0, 55] = 0,
; edi = address of last qword in block
pad_count:
mov eax, context.count[edx+4]
mov ecx, context.count[edx] ; eax:ecx = count
shld eax, ecx, 3
shl ecx, 3 ; eax:ecx = count * 8
; = number of message bits
if @Cpu and 16
bswap eax
bswap ecx ; eax:ecx = htonll(number of message bits)
else
xchg ah, al
xchg ch, cl
ror eax, 16
ror ecx, 16
xchg ah, al
xchg ch, cl
endif
; high dword is stored first
stosd
mov eax, ecx
stosd ; block[56, 63] = htonll(number of message bits)
push edx
call core
pop edx
; state is the first member of the structure, so the context address is
; also the address of state; the caller's ESI is kept in EDX meanwhile
xchg esi, edx ; esi = address of state
; only the saved EDI is on the stack here, so 'digest' is at [esp+12]
mov edi, [esp+12] ; edi = address of digest
mov ecx, lengthof context.state
digest:
lodsd
; without ALIGNED the digest is written byte by byte; with ALIGNED each
; word is byte-swapped and written with one dword store
ifndef ALIGNED
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
elseif @Cpu and 16
bswap eax
stosd
else
xchg ah, al
ror eax, 16
xchg ah, al
stosd
endif ; ALIGNED
dec ecx
jnz short digest
; restore the caller's ESI
mov esi, edx
pop edi
ret
final endp
; SHA256_Init: store the eight initial hash values H0 to H7 in
; context.state and set the 64-bit byte count to 0. context.block is not
; touched. The context address is read from the stack.
; EDI is kept in EDX instead of on the stack, so the argument stays at
; [esp+4]; EAX and EDX are clobbered.
init proc public ; void SHA256_Init(SHA256_CTX *context)
mov edx, edi
mov edi, [esp+4] ; edi = address of context structure
mov eax, 06A09E667h ; eax = H0
stosd ; state[0] = H0
mov eax, 0BB67AE85h ; eax = H1
stosd ; state[1] = H1
mov eax, 03C6EF372h ; eax = H2
stosd ; state[2] = H2
mov eax, 0A54FF53Ah ; eax = H3
stosd ; state[3] = H3
mov eax, 0510E527Fh ; eax = H4
stosd ; state[4] = H4
mov eax, 09B05688Ch ; eax = H5
stosd ; state[5] = H5
mov eax, 01F83D9ABh ; eax = H6
stosd ; state[6] = H6
mov eax, 05BE0CD19h ; eax = H7
stosd ; state[7] = H7
xor eax, eax ; eax = 0
; count directly follows state in the structure
stosd
stosd ; count = 0
; restore the caller's EDI
mov edi, edx
ret
init endp
; SHA256_Update: append 'size' bytes from 'data' to the block buffer of
; the context, add 'size' to the 64-bit byte count, and call the core
; every time the block buffer becomes full.
; Arguments are read from the stack; ESI and EDI are saved and restored.
; The remaining byte count is kept in EDI and the data pointer in ESI
; across the call of core, which uses neither register.
update proc public ; void SHA256_Update(SHA256_CTX *context,
; void const *data,
; unsigned int size)
mov eax, [esp+12] ; eax = number of bytes in data
test eax, eax
jz short none ; no data?
assume edx :ptr context
mov edx, [esp+4] ; edx = address of context structure
mov ecx, context.count[edx] ; ecx = low dword of count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[edx], eax
adc context.count[edx+4], 0 ; count += number of bytes in data
push esi
; one register was pushed, so the 'data' argument is now at [esp+12]
mov esi, [esp+12] ; esi = address of data
push edi
data:
lea edi, context.block[edx+ecx]
; edi = address of first free byte in block
; 64 - index, computed as (index xor 63) + 1 for index in 0..63
xor ecx, sizeof context.block - 1
inc ecx ; ecx = number of free bytes in block
sub eax, ecx ; eax = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
; fill the block completely, then process it
rep movsb ; esi = address of remaining data
mov edi, eax
push edx
call core
pop edx
mov eax, edi
xor ecx, ecx ; ecx = 0 = index of first free byte in block
test eax, eax
jnz short data ; more data?
pop edi
pop esi
none:
ret
last:
; eax is negative here: undo the subtraction to get the bytes actually left
add ecx, eax ; ecx = number of bytes in data
rep movsb
pop edi
pop esi
ret
update endp
end
Note: the function SHA256_Core()
has
2990 instructions in 7825 bytes.
Save the assembler source presented above as sha-256.asm
in an arbitrary, preferably empty
directory, then execute the following 2 command lines to generate
the 32-bit object file sha-256.obj
:
SET ML=/c /safeseh /W3 /X
ML.EXE /DALIGNED sha-256.asm
For details and reference see the MSDN article ML and ML64 Command-Line Reference.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) Macro Assembler Version 10.00.40219.01 Copyright (C) Microsoft Corporation. All rights reserved. Assembling: sha-256.asm
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// SHA-256 context as used by the assembler routines in sha-256.obj;
// member order and sizes must match the 'context' structure declared there.
typedef struct _SHA256_CTX
{
// hash state; the test program below prints these 8 words as the digest
DWORD State[8];
// number of message bytes, as two dwords
DWORD Count[2];
// 64-byte input block buffer
DWORD Block[16];
} SHA256_CTX;
// Process the 64-byte block held in Context->Block.
VOID CDECL SHA256_Core(SHA256_CTX *Context);
// Finish the message and store the 32-byte digest in Digest.
VOID CDECL SHA256_Final(SHA256_CTX *Context, BYTE Digest[32]);
// Prepare Context for a new message.
VOID CDECL SHA256_Init(SHA256_CTX *Context);
// Feed Size bytes starting at Data into the hash.
VOID CDECL SHA256_Update(SHA256_CTX *Context, LPCVOID Data, DWORD Size);
// __edivmodu(N, D): divide the 64-bit value N by the 32-bit value D and
// yield a 32-bit quotient and a 32-bit remainder.
// Outside of x86 this is a macro that expands to TWO comma-separated
// expressions, so it is only usable where two consecutive arguments are
// wanted, such as the trailing arguments of PrintConsole() below.
#ifndef _M_IX86
#define __edivmodu(N, D) (DWORD) ((N) / (D)), (DWORD) ((N) % (D))
#else
// On x86 a single DIV instruction is used: it divides EDX:EAX by the
// operand and leaves the quotient in EAX and the remainder in EDX.
// DIV faults when the quotient does not fit in 32 bits, so the high dword
// of the dividend must be smaller than the divisor.
// NOTE(review): there is no return statement; this presumably relies on
// the compiler returning the 8-byte structure in EDX:EAX - confirm.
__forceinline // companion for __emulu()
struct
{
DWORD ulQuotient, ulRemainder;
} CDECL __edivmodu(DWORD64 ullDividend, DWORD ulDivisor)
{
__asm mov eax, dword ptr ullDividend
__asm mov edx, dword ptr ullDividend+4
__asm div ulDivisor
}
#endif // _M_IX86
// Format a message printf-style with wvsprintf() and write it to the
// console handle hConsole.
// Returns TRUE only when the formatted text is not empty and
// WriteConsole() wrote all of its characters; FALSE otherwise.
__declspec(safebuffers)
BOOL CDECL PrintConsole(HANDLE hConsole, [SA_FormatString(Style="printf")] LPCWSTR lpFormat, ...)
{
    WCHAR   szBuffer[1024];
    DWORD   dwLength;
    DWORD   dwWritten;
    va_list vaArguments;

    va_start(vaArguments, lpFormat);
    dwLength = wvsprintf(szBuffer, lpFormat, vaArguments);
    va_end(vaArguments);

    // nothing formatted: report failure without touching the console
    if (dwLength == 0)
        return FALSE;

    if (!WriteConsole(hConsole, szBuffer, dwLength, &dwWritten, NULL))
        return FALSE;

    // a short write counts as failure too
    return dwWritten == dwLength;
}
// Entry point of the test program: runs the SHA-256 routines against a
// list of known messages, printing the expected digest followed by the
// computed one (taken from Context.State after SHA256_Final()), then
// times SHA256_Core() over 10^9 bytes and prints the cost per byte.
// With CYCLES defined the cost is measured in clock cycles via
// QueryThreadCycleTime(), otherwise in user-mode thread time via
// GetThreadTimes(). The process exits with the last Win32 error code
// recorded, or ERROR_SUCCESS.
__declspec(noreturn)
VOID CDECL wmainCRTStartup(VOID)
{
    SHA256_CTX Context;
    BYTE cbDigest[32], cbMillion[1000000];
    DWORD dwCPUID[12];
    DWORD dwError = ERROR_SUCCESS;
    // number of 64-byte blocks processed by the timing loop: 10^9 bytes
    DWORD dwThread = 1000000000 / 64;
    // [0] = start value, [1] = end value,
    // [2] = scratch for the times GetThreadTimes() returns but we ignore
    DWORD64 qwThread[3];
    HANDLE hThread = GetCurrentThread();
    HANDLE hConsole = GetStdHandle(STD_ERROR_HANDLE);

    if (hConsole == INVALID_HANDLE_VALUE)
        dwError = GetLastError();
    else
    {
        // fetch the 48-byte processor brand string when CPUID provides it
        __cpuid(dwCPUID, 0x80000000);

        if (*dwCPUID >= 0x80000004)
        {
            __cpuid(dwCPUID, 0x80000002);
            __cpuid(dwCPUID + 4, 0x80000003);
            __cpuid(dwCPUID + 8, 0x80000004);
        }
        else
            __movsb(dwCPUID, "unidentified processor", sizeof("unidentified processor"));

        if (SetThreadIdealProcessor(hThread, 0) == -1)
            PrintConsole(hConsole,
                         L"SetThreadIdealProcessor() returned error %lu\n",
                         dwError = GetLastError());

        if (!SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST))
            PrintConsole(hConsole,
                         L"SetThreadPriority() returned error %lu\n",
                         dwError = GetLastError());

        PrintConsole(hConsole, L"\nTesting SHA-256 implementation...\n");

        SHA256_Init(&Context);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"\"\n"
                     L"\te3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, "abc", 3);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"abc\"\n"
                     L"\tba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
                      sizeof("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") - 1);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq\"\n"
                     L"\t248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                      sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                     L"\tcf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, "\xBD", 1);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"\\xBD\"\n"
                     L"\t68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, "\xC9\x8C\x8E\x55", 4);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"\\xC9\\x8C\\x8E\\x55\"\n"
                     L"\t7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        // messages of 55, 56, 57 and 64 zero bytes probe the padding
        // boundaries: 56 is the first length that needs a second block
        __stosb(cbMillion, 0, 1000);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 55);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×55\n"
                     L"\t02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 56);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×56\n"
                     L"\td4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 57);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×57\n"
                     L"\t65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 64);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×64\n"
                     L"\tf5a5fd42 d16a2030 2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 1000);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×1000\n"
                     L"\t541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'A', 1000);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 1000);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"A…A\"\n"
                     L"\tc2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'U', 1005);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 1005);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"U…U\"\n"
                     L"\tf4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'a', 1000000);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 1000000);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\"a…a\"\n"
                     L"\tcdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 0, 1000000);

        SHA256_Init(&Context);
        SHA256_Update(&Context, cbMillion, 1000000);
        SHA256_Final(&Context, cbDigest);

        PrintConsole(hConsole,
                     L"\'\\0\'×1000000\n"
                     L"\td29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025\n"
                     L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        PrintConsole(hConsole, L"\nTiming SHA-256 on %.48hs:\n", dwCPUID);
#ifdef CYCLES
        if (!QueryThreadCycleTime(hThread, qwThread))
            PrintConsole(hConsole,
                         L"QueryThreadCycleTime() returned error %lu\n",
                         dwError = GetLastError());
        else
        {
            do
                SHA256_Core(&Context);
            while (--dwThread);

            if (!QueryThreadCycleTime(hThread, qwThread + 1))
                PrintConsole(hConsole,
                             L"QueryThreadCycleTime() returned error %lu\n",
                             dwError = GetLastError());
            else
                // cycles for 10^9 bytes, split into integer part and
                // 9 decimal places of cycles per byte
                PrintConsole(hConsole,
                             L"%lu.%09lu clock cycles per byte\n",
                             __edivmodu(qwThread[1] - qwThread[0], 1000000000));
        }
#else
        // Only the user time is of interest. The creation, exit and kernel
        // times all go to the scratch slot qwThread[2], so that neither call
        // overwrites the user time stored by the other one.
        if (!GetThreadTimes(hThread, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread))
            PrintConsole(hConsole,
                         L"GetThreadTimes() returned error %lu\n",
                         dwError = GetLastError());
        else
        {
            do
                SHA256_Core(&Context);
            while (--dwThread);

            if (!GetThreadTimes(hThread, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 1))
                PrintConsole(hConsole,
                             L"GetThreadTimes() returned error %lu\n",
                             dwError = GetLastError());
            else
                // thread times count 100 ns units: dividing by 10^7 gives
                // seconds per 10^9 bytes, i.e. nano-seconds per byte
                PrintConsole(hConsole,
                             L"%lu.%07lu nano-seconds per byte\n",
                             __edivmodu(qwThread[1] - qwThread[0], 10000000));
        }
#endif // CYCLES
    }

    ExitProcess(dwError);
}
Save the
ANSI C
source presented above as sha-256.c
next to the object
file sha-256.obj
assembled before, then run the
following 4 command lines to build the 32-bit console application
sha-256.exe
and execute it:
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /DCYCLES /Fosha-256.tmp sha-256.c sha-256.obj kernel32.lib user32.lib
.\sha-256.exe
For details and reference see the MSDN articles Compiler Options and Linker Options.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86 Copyright (C) Microsoft Corporation. All rights reserved. sha-256.c Microsoft (R) Incremental Linker Version 10.00.40219.386 Copyright (C) Microsoft Corporation. All rights reserved. /ENTRY:mainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE /out:sha-256.exe sha-256.tmp sha-256.obj kernel32.lib user32.lib Testing SHA-256 implementation... "" e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855 E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855 "abc" ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 248D6A61 D20638B8 E5C02693 0C3E6039 A33CE459 64FF2167 F6ECEDD4 19DB06C1 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" cf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1 CF5B16A7 78AF8380 036CE59E 7B049237 0B249B11 E8F07A51 AFAC4503 7AFEE9D1 "\xBD" 68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b 68325720 AABD7C82 F30F554B 313D0570 C95ACCBB 7DC4B5AA E11204C0 8FFE732B "\xC9\x8C\x8E\x55" 7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504 7ABC22C0 AE5AF26C E93DBB94 433A0E0B 2E119D01 4F8E7F65 BD56C61C CCCD9504 '\0'×55 02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7 02779466 CDEC1638 11D07881 5C633F21 90141308 1449002F 24AA3E80 F0B88EF7 '\0'×56 d4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb D4817AA5 497628E7 C77E6B60 6107042B BBA31308 88C5F47A 375E6179 BE789FBB '\0'×57 65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785 65A16CB7 861335D5 ACE3C607 18B5052E 44660726 DA4CD13B B745381B 235A1785 '\0'×64 f5a5fd42 d16a2030 
2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b F5A5FD42 D16A2030 2798EF6E D309979B 43003D23 20D9F0E8 EA9831A9 2759FB4B '\0'×1000 541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53 541B3E9D AA09B20B F85FA273 E5CBD3E8 0185AA4E C298E765 DB87742B 70138A53 "A…A" c2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4 C2E68682 3489CED2 017F6059 B8B23931 8B6364F6 DCD835D0 A519105A 1EADD6E4 "U…U" f4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0 F4D62DDE C0F3DD90 EA1380FA 16A5FF8D C4C54B21 740650F2 4AFC4120 903552B0 "a…a" cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0 '\0'×1000000 d29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025 D29751F2 649B32FF 572B5E0A 9F541EA6 60A50F94 FF0BEEDF B0B692B9 24CC8025 Timing SHA-256 on Intel(R) Core(TM)2 Duo CPU P8700 @ 2.53GHz: 17.395451136 clock cycles per byteIn other units: 7.382697 seconds per GiB, 145.4 MB per second, or 2.69 instructions per clock cycle.
On a newer processor, running at 3.4 GHz:
[…] Timing SHA-256 on AMD Ryzen 7 5700X 8-Core Processor : 10.739818952 clock cycles per byteAlso in other units: 2.941834 seconds per GiB, 316.6 MB per second, or 4.35 instructions per clock cycle.
Execution of the 32-bit console application sha-256.exe
on several newer Intel processors yields the following
results:
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz: 12.717682821 clock cycles per byte3.67 instructions per clock cycle.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-11400T @ 1.30GHz: 6.518409086 clock cycles per byte7.17 instructions per clock cycle.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz: 9.211634211 clock cycles per byte5.07 instructions per clock cycle.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700 @ 2.50GHz: 7.834806972 clock cycles per byte5.96 instructions per clock cycle.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700K @ 3.60GHz: 10.826128714 clock cycles per byte4.32 instructions per clock cycle.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i5-12400F: 5.324265424 clock cycles per byte8.77 instructions per clock cycle.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i7-12700K: 6.391453740 clock cycles per byte7.31 instructions per clock cycle.
[…] Timing SHA-256 on 13th Gen Intel(R) Core(TM) i5-1335U: 10.023694620 clock cycles per byte4.66 instructions per clock cycle.
; Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; Microsoft calling convention for AMD64 platform:
; - first 4 arguments (from left to right) are passed in registers
; RCX/R1 or XMM0, RDX/R2 or XMM1, R8 or XMM2, and R9 or XMM3,
; depending on their type (for floating-point arguments of
; unprototyped or variadic functions, where argument type
; expected by callee is unknown, both registers are used);
; - arguments larger than 8 bytes are passed by reference;
; - surplus arguments are pushed on stack in reverse order (from
; right to left), 8-byte aligned;
; - caller allocates memory for return value larger than 8 bytes and
; passes pointer to it as (hidden) first argument, thus shifting
; all other arguments;
; - caller always allocates "home space" for 4 arguments on stack, even
; when less than 4 arguments are passed, but does not need to push
; first 4 arguments;
; - callee can spill first 4 arguments from registers to "home space";
; - callee can clobber "home space";
; - stack is 16-byte aligned: callee must decrement RSP by 8+n*16 bytes
; when it calls other functions (CALL instruction pushes 8 bytes);
; - integer or pointer result is returned in register RAX/R0;
; - floating-point result is returned in register XMM0;
; - registers RAX/R0, RCX/R1, RDX/R2, R8, R9, R10, R11 and XMM0 to XMM5
; are volatile and can be clobbered;
; - registers RBX/R3, RSP/R4, RBP/R5, RSI/R6, RDI/R7, R12, R13, R14,
; R15 and XMM6 to XMM15 must be preserved.
; Short internal names for the four public procedures; the text macros
; expand to the exported symbol names.
core textequ <SHA256_Core>
final textequ <SHA256_Final>
init textequ <SHA256_Init>
update textequ <SHA256_Update>
; Layout of the hash context for the 64-bit build: 8 state dwords, the
; byte count as one qword, and the 64-byte block.
context struct 8 ; SHA256_CTX structure
state dword 8 dup (?)
count qword ?
block dword 16 dup (?) ; also "message schedule" W'[16]
context ends
; in order to fold the message schedule W[64] to W'[16] alias block[16],
; W[i] becomes W'[i % 16] alias block[i & 15]
; in order to rotate the 8 working variables a to h through R8D to R15D,
; a becomes R((64 - t) % 8 + 8)D alias R(8 - t & 7)D,
; b becomes R((64 + 'b' - 'a' - t) % 8 + 8)D alias R(9 - t & 7)D,
; etc.
; state(s): macro function yielding the name of the 32-bit register that
; holds working variable number s: (s mod 8) + 8 selects R8D to R15D.
state macro s :req
exitm @CatStr(<r>, %((&s) mod 8 + 8), <d>)
endm
; sha256 k, t: emit the code for one SHA-256 round (64-bit build).
;   k = round constant K[t] (must be a constant expression)
;   t = round number, 0 to 63 (must be a constant expression)
; Expects RBP = address of the 64-byte block and the working variables in
; R8D to R15D; clobbers EAX, EBX, ECX and EDX. For t < 16 the message word
; is byte-swapped in place, for t >= 16 it is derived from earlier words
; and stored back into the block, so the input block is overwritten.
; The working variables are not moved between rounds; instead the
; register assigned to each of them depends on t.
sha256 macro k :req, t :req
;; reject arguments that are not constant expressions
.erre 4 and (opattr &k) and (opattr &t)
;; reject round numbers outside 0..63
.errnz -64 and &t
if &t lt 16
mov eax, [rbp+4*&t]
bswap eax ;; eax = ntohl(W[t])
else ; &t ge 16
;; block indices are reduced modulo 16 and scaled to byte offsets
mov eax, [rbp+(&t-15) mod 16 shl 2]
mov ebx, [rbp+(&t-2) mod 16 shl 2]
mov ecx, eax ;; ecx = m = W[t - 15]
mov edx, ebx ;; edx = n = W[t - 2]
shr eax, 3 ;; eax = m >> 3
shr ebx, 10 ;; ebx = n >> 10
ror ecx, 7 ;; ecx = ROTR(m, 7)
ror edx, 17 ;; edx = ROTR(n, 17)
xor eax, ecx ;; eax = (m >> 3) ^ ROTR(m, 7)
xor ebx, edx ;; ebx = (n >> 10) ^ ROTR(n, 17)
;; rotate the already rotated values by the difference only
ror ecx, 18-7 ;; ecx = ROTR(m, 18)
ror edx, 19-17 ;; edx = ROTR(n, 19)
xor eax, ecx ;; eax = (m >> 3) ^ ROTR(m, 7) ^ ROTR(m, 18)
;; = SMALL_0(m)
xor ebx, edx ;; ebx = (n >> 10) ^ ROTR(n, 17) ^ ROTR(n, 19)
;; = SMALL_1(n)
add eax, [rbp+(&t-16) mod 16 shl 2]
add ebx, [rbp+(&t-7) mod 16 shl 2]
add eax, ebx ;; eax = SMALL_0(W[t - 15]) + W[t - 16]
;; + SMALL_1(W[t - 2]) + W[t - 7]
endif ; &t ge 16
mov [rbp+(&t) mod 16 shl 2], eax
;; W[t] = (t < 16) ? ntohl(W[t])
;; : SMALL_0(W[t - 15]) + W[t - 16]
;; + SMALL_1(W[t - 2]) + W[t - 7]
add eax, &k ;; eax = W[t] + k[t]
mov ecx, state(68-&t) ;; ecx = e
mov edx, state(68-&t) ;; edx = e
add eax, state(71-&t) ;; eax = W[t] + k[t] + h
ror ecx, 6 ;; ecx = ROTR(e, 6)
mov ebx, state(70-&t) ;; ebx = g
ror edx, 11 ;; edx = ROTR(e, 11)
xor ebx, state(69-&t) ;; ebx = g ^ f
xor ecx, edx ;; ecx = ROTR(e, 6) ^ ROTR(e, 11)
and ebx, state(68-&t) ;; ebx = (g ^ f) & e
ror edx, 25-11 ;; edx = ROTR(e, 25)
xor ebx, state(70-&t) ;; ebx = (g ^ f) & e ^ g
;; = CH(e, f, g)
xor ecx, edx ;; ecx = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25)
;; = SIGMA_1(e)
add eax, ebx ;; eax = W[t] + k[t] + h + CH(e, f, g)
add eax, ecx ;; eax = W[t] + k[t] + h + CH(e, f, g) + SIGMA_1(e)
;; = T1
add state(67-&t), eax ;; d' = d + T1
;; state(71-t) and state(63-t) name the same register: it first receives
;; T1 and has T2 added at the end of the round
mov state(71-&t), eax ;; h' = T1
mov eax, state(64-&t) ;; eax = a
mov ebx, state(65-&t) ;; ebx = b
mov ecx, state(64-&t) ;; ecx = a
mov edx, state(64-&t) ;; edx = a
ror ecx, 2 ;; ecx = ROTR(a, 2)
xor eax, state(65-&t) ;; eax = a ^ b
ror edx, 13 ;; edx = ROTR(a, 13)
and ebx, state(64-&t) ;; ebx = a & b
xor ecx, edx ;; ecx = ROTR(a, 2) ^ ROTR(a, 13)
and eax, state(66-&t) ;; eax = (a ^ b) & c
ror edx, 22-13 ;; edx = ROTR(a, 22)
or eax, ebx ;; eax = (a & b) | ((a ^ b) & c)
;; = MAJ(a, b, c)
xor ecx, edx ;; ecx = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22)
;; = SIGMA_0(a)
add eax, ecx ;; eax = T2
add state(63-&t), eax ;; h" = T1 + T2
endm
.code
; SHA256_Core (64-bit build): process the 64-byte block held in
; context.block and add the resulting working variables to context.state.
; RCX = address of the context structure on entry.
; RBP, RBX and R12 to R15 are saved and restored; RAX, RCX, RDX and R8 to
; R11 are clobbered. The contents of context.block are overwritten by the
; rounds (see the sha256 macro).
; NOTE(review): the procedure pushes nonvolatile registers but declares no
; unwind information (no FRAME attribute, .pushreg or .endprolog) - confirm
; this is acceptable for the intended callers.
core proc public ; void SHA256_Core(SHA256_CTX *context)
push rbp
push rbx
push r12
push r13
push r14
push r15
; ECX is used as scratch register by the rounds, so the block address is
; kept in RBP and the state is addressed relative to it afterwards
lea rbp, context.block[rcx] ; rbp = address of block
mov r8d, context.state[rcx] ; load working variables from state
mov r9d, context.state[rcx+4]
mov r10d, context.state[rcx+8]
mov r11d, context.state[rcx+12]
mov r12d, context.state[rcx+16]
mov r13d, context.state[rcx+20]
mov r14d, context.state[rcx+24]
mov r15d, context.state[rcx+28]
; calculate 64 rounds
sha256 0428A2F98h, 0
sha256 071374491h, 1
sha256 0B5C0FBCFh, 2
sha256 0E9B5DBA5h, 3
sha256 03956C25Bh, 4
sha256 059F111F1h, 5
sha256 0923F82A4h, 6
sha256 0AB1C5ED5h, 7
sha256 0D807AA98h, 8
sha256 012835B01h, 9
sha256 0243185BEh, 10
sha256 0550C7DC3h, 11
sha256 072BE5D74h, 12
sha256 080DEB1FEh, 13
sha256 09BDC06A7h, 14
sha256 0C19BF174h, 15
sha256 0E49B69C1h, 16
sha256 0EFBE4786h, 17
sha256 00FC19DC6h, 18
sha256 0240CA1CCh, 19
sha256 02DE92C6Fh, 20
sha256 04A7484AAh, 21
sha256 05CB0A9DCh, 22
sha256 076F988DAh, 23
sha256 0983E5152h, 24
sha256 0A831C66Dh, 25
sha256 0B00327C8h, 26
sha256 0BF597FC7h, 27
sha256 0C6E00BF3h, 28
sha256 0D5A79147h, 29
sha256 006CA6351h, 30
sha256 014292967h, 31
sha256 027B70A85h, 32
sha256 02E1B2138h, 33
sha256 04D2C6DFCh, 34
sha256 053380D13h, 35
sha256 0650A7354h, 36
sha256 0766A0ABBh, 37
sha256 081C2C92Eh, 38
sha256 092722C85h, 39
sha256 0A2BFE8A1h, 40
sha256 0A81A664Bh, 41
sha256 0C24B8B70h, 42
sha256 0C76C51A3h, 43
sha256 0D192E819h, 44
sha256 0D6990624h, 45
sha256 0F40E3585h, 46
sha256 0106AA070h, 47
sha256 019A4C116h, 48
sha256 01E376C08h, 49
sha256 02748774Ch, 50
sha256 034B0BCB5h, 51
sha256 0391C0CB3h, 52
sha256 04ED8AA4Ah, 53
sha256 05B9CCA4Fh, 54
sha256 0682E6FF3h, 55
sha256 0748F82EEh, 56
sha256 078A5636Fh, 57
sha256 084C87814h, 58
sha256 08CC70208h, 59
sha256 090BEFFFAh, 60
sha256 0A4506CEBh, 61
sha256 0BEF9A3F7h, 62
sha256 0C67178F2h, 63
; add working variables to state
; subtracting the offset of block from RBP yields the context address again
add context.state[rbp-context.block], r8d
add context.state[rbp+4-context.block], r9d
add context.state[rbp+8-context.block], r10d
add context.state[rbp+12-context.block], r11d
add context.state[rbp+16-context.block], r12d
add context.state[rbp+20-context.block], r13d
add context.state[rbp+24-context.block], r14d
add context.state[rbp+28-context.block], r15d
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
core endp
final proc public ; void SHA256_Final(SHA256_CTX *context,
; unsigned char digest[32])
; Pads the partially filled block with a single 0b10000000 byte and zeroes,
; appends the message length in bits as a big-endian qword, calls core for
; each completed block and finally writes the state, one dword at a time in
; big-endian byte order, to the digest buffer.
; In: rcx = address of context structure, rdx = address of digest.
; rsi and rdi are used by the string instructions and restored before return.
; NOTE(review): stosb/stosq/lodsd advance upwards only when the direction
; flag is clear; the code does not clear it itself -- presumably guaranteed
; by the caller.
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push rdi
lea rdi, context.block[r9+rcx]
mov r8, rdi ; r8 = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; rax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 63 - index
rep stosb ; block[index + 1, 63] = 0
sub rdi, sizeof context.count
; rdi = address of last qword in block
cmp r8, rdi
jb short pad_count ; space for count available in block?
; index < 56?
; no: process the padded block first, then build a second block that
; holds only zeroes and the count
mov rcx, r9 ; rcx = address of context structure
push r9
push r8
push rdx
push rax
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rax
pop rdx
pop r8
pop r9
pad_block:
mov rcx, rdi ; rcx = address of last qword in block
lea rdi, context.block[r9] ; rdi = address of block
;; xor eax, eax ; rax = 0
sub rcx, rdi ; rcx = number of bytes before last qword
; = 56
rep stosb ; block[0, 55] = 0,
; rdi = address of last qword in block
pad_count:
mov rax, context.count[r9] ; rax = count
shl rax, 3 ; rax = count * 8
; = number of message bits
bswap rax ; rax = htonll(number of message bits)
stosq ; block[56, 63] = htonll(number of message bits)
mov rcx, r9 ; rcx = address of context structure
push r9
push rdx
sub rsp, 32 ; "home space"
call core
add rsp, 32
; the two pops below retrieve the values pushed before the call:
; first the digest address (pushed from rdx), then the context address
; (pushed from r9); the caller's rsi is parked in r9 in between
pop rdi ; rdi = address of digest
mov r9, rsi
pop rsi ; rsi = address of state
; NOTE(review): rsi actually receives the address of the context
; structure; this equals the address of state only if state is
; its first member -- confirm against the structure definition
mov ecx, lengthof context.state
digest:
lodsd
ifndef ALIGNED
; byte-wise store, most significant byte first
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
else ; ALIGNED
bswap eax
stosd
endif ; ALIGNED
dec ecx
jnz short digest
mov rsi, r9
pop rdi
ret
final endp
init proc public ; void SHA256_Init(SHA256_CTX *context)
; Stores the eight initial hash values H0..H7 followed by a zero qword
; (the count) starting at the address of the context structure.
; In: rcx = address of context structure.
; NOTE(review): the stores begin at the context address itself, so this
; presumes that state is the first member and is immediately followed by
; count -- confirm against the structure definition.
; NOTE(review): stosd/stosq advance upwards only when the direction flag
; is clear; the code does not clear it itself.
xchg rdi, rcx ; rdi = address of context structure
; rcx = caller's rdi, restored before return
if 0
; disabled variant: eight dword stores
mov eax, 06A09E667h ; eax = H0
stosd ; state[0] = H0
mov eax, 0BB67AE85h ; eax = H1
stosd ; state[1] = H1
mov eax, 03C6EF372h ; eax = H2
stosd ; state[2] = H2
mov eax, 0A54FF53Ah ; eax = H3
stosd ; state[3] = H3
mov eax, 0510E527Fh ; eax = H4
stosd ; state[4] = H4
mov eax, 09B05688Ch ; eax = H5
stosd ; state[5] = H5
mov eax, 01F83D9ABh ; eax = H6
stosd ; state[6] = H6
mov eax, 05BE0CD19h ; eax = H7
stosd ; state[7] = H7
else
; active variant: four qword stores, two hash values each
mov rax, 0BB67AE856A09E667h ; rax = H1 << 32 | H0
stosq ; state[0] = H0,
; state[1] = H1
mov rax, 0A54FF53A3C6EF372h ; rax = H3 << 32 | H2
stosq ; state[2] = H2,
; state[3] = H3
mov rax, 09B05688C510E527Fh ; rax = H5 << 32 | H4
stosq ; state[4] = H4,
; state[5] = H5
mov rax, 05BE0CD191F83D9ABh ; rax = H7 << 32 | H6
stosq ; state[6] = H6,
; state[7] = H7
endif
xor eax, eax ; rax = 0
stosq ; count = 0
mov rdi, rcx
ret
init endp
update proc public ; void SHA256_Update(SHA256_CTX *context,
; void const *data,
; unsigned int size)
; Appends size bytes of data to the block in the context structure, adds
; size to the count and calls core every time the block becomes full.
; In: rcx = address of context structure, rdx = address of data,
; r8 = number of bytes in data.
; rsi and rdi are used by movsb and restored before return.
; NOTE(review): the prototype in the comment above declares size as
; unsigned int, but the code tests and subtracts the full 64-bit r8;
; confirm that the C declaration passes a 64-bit (or zero-extended) size.
test r8, r8
jz short none ; no data?
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[r9], r8 ; count += number of bytes in data
push rsi
mov rsi, rdx ; rsi = address of data
push rdi
data:
lea rdi, context.block[r9+rcx]
; rdi = address of first free byte in block
xor ecx, sizeof context.block - 1
inc ecx ; rcx = number of free bytes in block
sub r8, rcx ; r8 = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
rep movsb ; rsi = address of remaining data
mov rdi, r9 ; keep the context address in rdi across the call
mov rcx, r9 ; rcx = address of context structure
push r8
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop r8
mov r9, rdi ; r9 = address of context structure
xor ecx, ecx ; rcx = 0 = index of first free byte in block
test r8, r8
jnz short data ; more data?
pop rdi
pop rsi
none:
ret
last:
; the subtraction above went below zero; adding r8 back to the number
; of free bytes yields the number of bytes that remain to be copied
add rcx, r8 ; rcx = number of bytes in data
rep movsb
pop rdi
pop rsi
ret
update endp
end
Note: the function SHA256_Core()
has
2990 instructions in 8232 bytes.
Save the AMD64 assembler source presented above as the text file
sha-256.asm
, then execute
the following 6 command lines to assemble the 64-bit object file
sha-256.obj
, build the 64-bit console application
sha-256.exe
and execute it:
SET ML=/c /W3 /X
ML64.EXE /DALIGNED sha-256.asm
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /DCYCLES /Fosha-256.tmp sha-256.c sha-256.obj kernel32.lib user32.lib
.\sha-256.exe
Microsoft (R) Macro Assembler Version (x64) 10.00.40219.01 Copyright (C) Microsoft Corporation. All rights reserved. Assembling: sha-256.asm Microsoft (R) C/C++ Optimizing Compiler Version 16.00.40219.01 for x64 Copyright (C) Microsoft Corporation. All rights reserved. sha-256.c Microsoft (R) Incremental Linker Version 10.00.40219.386 Copyright (C) Microsoft Corporation. All rights reserved. /ENTRY:mainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE /out:sha-256.exe sha-256.tmp sha-256.obj kernel32.lib user32.lib Testing SHA-256 implementation... "" e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855 E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855 "abc" ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 248D6A61 D20638B8 E5C02693 0C3E6039 A33CE459 64FF2167 F6ECEDD4 19DB06C1 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" cf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1 CF5B16A7 78AF8380 036CE59E 7B049237 0B249B11 E8F07A51 AFAC4503 7AFEE9D1 "\xBD" 68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b 68325720 AABD7C82 F30F554B 313D0570 C95ACCBB 7DC4B5AA E11204C0 8FFE732B "\xC9\x8C\x8E\x55" 7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504 7ABC22C0 AE5AF26C E93DBB94 433A0E0B 2E119D01 4F8E7F65 BD56C61C CCCD9504 '\0'×55 02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7 02779466 CDEC1638 11D07881 5C633F21 90141308 1449002F 24AA3E80 F0B88EF7 '\0'×56 d4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb D4817AA5 497628E7 C77E6B60 6107042B BBA31308 88C5F47A 375E6179 BE789FBB '\0'×57 65a16cb7 861335d5 ace3c607 18b5052e 
44660726 da4cd13b b745381b 235a1785 65A16CB7 861335D5 ACE3C607 18B5052E 44660726 DA4CD13B B745381B 235A1785 '\0'×64 f5a5fd42 d16a2030 2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b F5A5FD42 D16A2030 2798EF6E D309979B 43003D23 20D9F0E8 EA9831A9 2759FB4B '\0'×1000 541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53 541B3E9D AA09B20B F85FA273 E5CBD3E8 0185AA4E C298E765 DB87742B 70138A53 "A…A" c2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4 C2E68682 3489CED2 017F6059 B8B23931 8B6364F6 DCD835D0 A519105A 1EADD6E4 "U…U" f4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0 F4D62DDE C0F3DD90 EA1380FA 16A5FF8D C4C54B21 740650F2 4AFC4120 903552B0 "a…a" cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0 '\0'×1000000 d29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025 D29751F2 649B32FF 572B5E0A 9F541EA6 60A50F94 FF0BEEDF B0B692B9 24CC8025 Timing SHA-256 on Intel(R) Core(TM)2 Duo CPU P8700 @ 2.53GHz: 16.827606719 clock cycles per byteIn other units: 6.194439 seconds per GiB, 150.3 MB per second, or 2.78 instructions per clock cycle – only 3.4% faster than the 32-bit console application.
On a newer processor, running at 3.4 GHz:
[…] Timing SHA-256 on AMD Ryzen 7 5700X 8-Core Processor : 7.318561846 clock cycles per byte
Also in other units: 2.004689 seconds per GiB, 464.6 MB per second, or 6.38 instructions per clock cycle – 32% faster than the 32-bit console application!
Execution of the 64-bit console application sha-256.exe
on several newer Intel processors yields the following
results:
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz: 9.104778206 clock cycles per byte
5.13 instructions per clock cycle – 28% faster than the 32-bit console application!
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-11400T @ 1.30GHz: 5.535540612 clock cycles per byte
8.44 instructions per clock cycle – 15% faster than the 32-bit console application!
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz: 8.991402156 clock cycles per byte
5.20 instructions per clock cycle – just 2.4% faster than the 32-bit console application.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700 @ 2.50GHz: 7.557294528 clock cycles per byte
6.18 instructions per clock cycle – just 3.5% faster than the 32-bit console application.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700K @ 3.60GHz: 10.406664638 clock cycles per byte
4.49 instructions per clock cycle – just 3.9% faster than the 32-bit console application.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i5-12400F: 5.208533820 clock cycles per byte
8.97 instructions per clock cycle – just 2.2% faster than the 32-bit console application.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i7-12700K: 6.126580126 clock cycles per byte
7.63 instructions per clock cycle – just 4.1% faster than the 32-bit console application.
[…] Timing SHA-256 on 13th Gen Intel(R) Core(TM) i5-1335U: 9.012003232 clock cycles per byte
5.18 instructions per clock cycle – 10% faster than the 32-bit console application!
With our implementation a single core of an Intel® Core™i7 processor 2600 with Intel® HT Technology can compute Fast SHA-256 of a large data buffer at the rate of ~11.5 cycles/byte.
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// * The software is provided "as is" without any warranty, neither express
// nor implied.
// * In no event will the author be held liable for any damage(s) arising
// from the use of the software.
// * Redistribution of the software is allowed only in unmodified form.
// * Permission is granted to use the software solely for personal private
// and non-commercial purposes.
// * An individuals use of the software in his or her capacity or function
// as an agent, (independent) contractor, employee, member or officer of
// a business, corporation or organization (commercial or non-commercial)
// does not qualify as personal private and non-commercial purpose.
// * Without written approval from the author the software must not be used
// for a business, for commercial, corporate, governmental, military or
// organizational purposes of any kind, or in a commercial, corporate,
// governmental, military or organizational environment of any kind.
// SHA-512 word operations from FIPS 180-4 for 64-bit operands.
// Every macro parenthesizes its arguments and its whole expansion, so it
// can be combined with any operator and called with any expression.
// NOTE: the arguments are evaluated more than once; do not pass
// expressions with side effects.

// rotate the 64-bit value m right by n bits, 0 < n < 64
#define ROTR(m, n) (((m) << (64 - (n))) ^ ((m) >> (n)))

// "small" sigma functions, used to expand the message schedule
#define SMALL_0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ ((x) >> 7))
#define SMALL_1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ ((x) >> 6))

// "capital" sigma functions, used in the compression rounds
#define SIGMA_0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
#define SIGMA_1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))

#if 0
// textbook forms of "choose" and "majority"
#define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#else
// equivalent forms with one operation less
#define CH(x, y, z) (((x) & ((y) ^ (z))) ^ (z))
#define MAJ(x, y, z) (((x) & (y)) ^ (((x) ^ (y)) & (z)))
#endif
// SHA-512 hashing context
typedef struct _sha512_ctx {
    unsigned long long state[8];    // eight 64-bit hash values
    unsigned long long count[2];    // message length in bytes:
                                    // [0] = low qword, [1] = high qword
    unsigned long long block[16];   // 128-byte buffer for the current block
} sha512_ctx;
// the 80 round constants of SHA-384 and SHA-512 (FIPS 180-4, section 4.2.3),
// one per compression round
static const unsigned long long k[80] = {
    0x428A2F98D728AE22ULL, 0x7137449123EF65CDULL, 0xB5C0FBCFEC4D3B2FULL, 0xE9B5DBA58189DBBCULL,
    0x3956C25BF348B538ULL, 0x59F111F1B605D019ULL, 0x923F82A4AF194F9BULL, 0xAB1C5ED5DA6D8118ULL,
    0xD807AA98A3030242ULL, 0x12835B0145706FBEULL, 0x243185BE4EE4B28CULL, 0x550C7DC3D5FFB4E2ULL,
    0x72BE5D74F27B896FULL, 0x80DEB1FE3B1696B1ULL, 0x9BDC06A725C71235ULL, 0xC19BF174CF692694ULL,
    0xE49B69C19EF14AD2ULL, 0xEFBE4786384F25E3ULL, 0x0FC19DC68B8CD5B5ULL, 0x240CA1CC77AC9C65ULL,
    0x2DE92C6F592B0275ULL, 0x4A7484AA6EA6E483ULL, 0x5CB0A9DCBD41FBD4ULL, 0x76F988DA831153B5ULL,
    0x983E5152EE66DFABULL, 0xA831C66D2DB43210ULL, 0xB00327C898FB213FULL, 0xBF597FC7BEEF0EE4ULL,
    0xC6E00BF33DA88FC2ULL, 0xD5A79147930AA725ULL, 0x06CA6351E003826FULL, 0x142929670A0E6E70ULL,
    0x27B70A8546D22FFCULL, 0x2E1B21385C26C926ULL, 0x4D2C6DFC5AC42AEDULL, 0x53380D139D95B3DFULL,
    0x650A73548BAF63DEULL, 0x766A0ABB3C77B2A8ULL, 0x81C2C92E47EDAEE6ULL, 0x92722C851482353BULL,
    0xA2BFE8A14CF10364ULL, 0xA81A664BBC423001ULL, 0xC24B8B70D0F89791ULL, 0xC76C51A30654BE30ULL,
    0xD192E819D6EF5218ULL, 0xD69906245565A910ULL, 0xF40E35855771202AULL, 0x106AA07032BBD1B8ULL,
    0x19A4C116B8D2D0C8ULL, 0x1E376C085141AB53ULL, 0x2748774CDF8EEB99ULL, 0x34B0BCB5E19B48A8ULL,
    0x391C0CB3C5C95A63ULL, 0x4ED8AA4AE3418ACBULL, 0x5B9CCA4F7763E373ULL, 0x682E6FF3D6B2B8A3ULL,
    0x748F82EE5DEFB2FCULL, 0x78A5636F43172F60ULL, 0x84C87814A1F0AB72ULL, 0x8CC702081A6439ECULL,
    0x90BEFFFA23631E28ULL, 0xA4506CEBDE82BDE9ULL, 0xBEF9A3F7B2C67915ULL, 0xC67178F2E372532BULL,
    0xCA273ECEEA26619CULL, 0xD186B8C721C0C207ULL, 0xEADA7DD6CDE0EB1EULL, 0xF57D4F7FEE6ED178ULL,
    0x06F067AA72176FBAULL, 0x0A637DC5A2C898A6ULL, 0x113F9804BEF90DAEULL, 0x1B710B35131C471BULL,
    0x28DB77F523047D84ULL, 0x32CAAB7B40C72493ULL, 0x3C9EBE0A15C9BEBCULL, 0x431D67C49C100D4CULL,
    0x4CC5D4BECB3E42B6ULL, 0x597F299CFC657E2AULL, 0x5FCB6FAB3AD6FAECULL, 0x6C44198C4A475817ULL
};
#include <arpa/inet.h> // for htonl() and ntohl() functions
// Convert a 64-bit value from host to network (big-endian) byte order.
// On a big-endian host the value is returned unchanged; otherwise both
// 32-bit halves are converted with htonl() and exchanged.
unsigned long long htonll(unsigned long long h)
{
    unsigned long long from_high, from_low;

    if (htonl(1) == 1)      // host byte order is network byte order
        return h;

    from_high = htonl((unsigned long) (h >> 32));
    from_low = htonl((unsigned long) h);

    // the converted low half becomes the new high half
    return from_low << 32 | from_high;
}
// Convert a 64-bit value from network (big-endian) to host byte order.
// On a big-endian host the value is returned unchanged; otherwise both
// 32-bit halves are converted with ntohl() and exchanged.
unsigned long long ntohll(unsigned long long n)
{
    unsigned long long from_high, from_low;

    if (ntohl(1) == 1)      // network byte order is host byte order
        return n;

    from_high = ntohl((unsigned long) (n >> 32));
    from_low = ntohl((unsigned long) n);

    // the converted low half becomes the new high half
    return from_low << 32 | from_high;
}
// Process the 128-byte block stored in the context: expand it into the
// 80-word message schedule, run the 80 compression rounds on a copy of
// the state and add the result to the state. The block itself is not
// modified.
void sha512_core(sha512_ctx *context)
{
    unsigned long long a, b, c, d, e, f, g, h;
    unsigned long long t1, t2;
    unsigned long long w[80];
    unsigned int round;

    // the first 16 words of the schedule are the big-endian block words
    for (round = 0; round < 16; round++)
        w[round] = ntohll(context->block[round]);

    // the remaining 64 words are derived from the earlier ones
    for (round = 16; round < 80; round++)
        w[round] = w[round - 16] + SMALL_0(w[round - 15])
                 + w[round - 7] + SMALL_1(w[round - 2]);

    // working variables start out as the current state
    a = context->state[0];
    b = context->state[1];
    c = context->state[2];
    d = context->state[3];
    e = context->state[4];
    f = context->state[5];
    g = context->state[6];
    h = context->state[7];

    for (round = 0; round < 80; round++) {
        t1 = h + SIGMA_1(e) + CH(e, f, g) + k[round] + w[round];
        t2 = SIGMA_0(a) + MAJ(a, b, c);

        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    // fold the working variables back into the state
    context->state[0] += a;
    context->state[1] += b;
    context->state[2] += c;
    context->state[3] += d;
    context->state[4] += e;
    context->state[5] += f;
    context->state[6] += g;
    context->state[7] += h;
}
#include <string.h> // for memcpy() and memset() functions
// Finish the hash computation: pad the message, append its length in bits
// as a 128-bit big-endian number, process the last block(s) and write the
// 64-byte message digest.
// Side effect: the state in the context is left in network byte order,
// i.e. the context holds the digest too and must be re-initialised with
// sha512_init() before it is used for another message.
void sha512_final(sha512_ctx *context, unsigned char digest[64])
{
    unsigned char *bytes = (unsigned char *) context->block;
    unsigned int used = context->count[0] & 127;    // bytes already in block
    unsigned int word;

    bytes[used] = 0x80;                 // padding starts with a single 1 bit

    if (used >= 112) {
        // no room left for the 16-byte length: zero-fill this block,
        // process it and start a new one
        memset(bytes + used + 1, 0, 127 - used);
        sha512_core(context);
        memset(bytes, 0, 112);
    } else
        memset(bytes + used + 1, 0, 111 - used);

    // message length in bits: count is in bytes, so multiply by 8 and
    // carry the upper 3 bits of the low qword into the high qword
    context->block[14] = htonll(context->count[1] << 3
                              | context->count[0] >> 61);
    context->block[15] = htonll(context->count[0] << 3);
    sha512_core(context);

    // the digest is the state in network byte order
    for (word = 0; word < 8; word++)
        context->state[word] = htonll(context->state[word]);

    memcpy(digest, context->state, 64);
}
// Prepare the context for a new message: reset the byte counter and load
// the initial hash values of SHA-512.
void sha512_init(sha512_ctx *context)
{
    static const unsigned long long initial[8] = {
        0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
        0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
        0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
        0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
    };
    unsigned int word;

    for (word = 0; word < 8; word++)
        context->state[word] = initial[word];

    context->count[0] = 0;
    context->count[1] = 0;
}
// Feed size bytes of message data into the hash computation.
// The data is appended to the 128-byte block of the context; every time
// the block is full it is processed with sha512_core(). Any remainder
// stays in the block for the next call or for sha512_final().
// The function may be called any number of times with arbitrary sizes,
// including 0.
void sha512_update(sha512_ctx *context, unsigned char const *data, unsigned long long size)
{
    unsigned int used = context->count[0] & 127;    // bytes already in block
    unsigned int room = 128 - used;                 // free bytes in block
    unsigned char *b = (unsigned char *) context->block + used;

    context->count[0] += size;          // update message length
    if (context->count[0] < size)       // carry into the high qword
        context->count[1]++;

    while (size >= room) {              // enough data to complete a block
        memcpy(b, data, room);
        sha512_core(context);           // process full block
        data += room;
        size -= room;
        // the next bytes go to the start of the now free block; without
        // this reset a call that started at a non-zero offset would keep
        // writing at that offset and overrun the block
        b = (unsigned char *) context->block;
        room = 128;
    }

    if (size > 0)                       // keep the remainder for later
        memcpy(b, data, size);
}
#include <stdio.h>
#include <time.h>
// Hash size bytes at data and print one test record: the label, the
// expected upper half of the digest (as given), the computed digest in
// two rows of four 16-digit upper-case hexadecimal groups, and the
// expected lower half of the digest (as given).
static void sha512_test(char const *label, void const *data, unsigned long long size,
                        char const *expected_upper, char const *expected_lower)
{
    unsigned char digest[64];
    unsigned int i;
    sha512_ctx context;

    sha512_init(&context);
    sha512_update(&context, (unsigned char const *) data, size);
    sha512_final(&context, digest);

    printf("%s\n\t%s\n", label, expected_upper);
    for (i = 0; i < 64; i++) {
        if (i % 8 == 0)                 // start of a 64-bit group
            putchar(i % 32 == 0 ? '\t' : ' ');
        printf("%02X", digest[i]);
        if (i % 32 == 31)               // four groups per row
            putchar('\n');
    }
    printf("\t%s\n", expected_lower);
}

// Self-test and benchmark: prints expected and computed digests for a
// set of known messages, then measures the time sha512_core() needs to
// process 1 GiB worth of blocks.
int main(void)
{
    static unsigned char million[1000000];  // static: too large for the stack
    unsigned int n = 1024 * 1024 * 1024 / 128;
    clock_t t;
    sha512_ctx context;

    sha512_test("\"\"", "", 0,
                "cf83e1357eefb8bd f1542850d66d8007 d620e4050b5715dc 83f4a921d36ce9ce",
                "47d0d13c5d85f2b0 ff8318d2877eec2f 63b931bd47417a81 a538327af927da3e");

    sha512_test("\"abc\"", "abc", 3,
                "ddaf35a193617aba cc417349ae204131 12e6fa4e89a97ea2 0a9eeee64b55d39a",
                "2192992a274fc1a8 36ba3c23a3feebbd 454d4423643ce80e 2a9ac94fa54ca49f");

    sha512_test("\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"",
                "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1,
                "8e959b75dae313da 8cf4f72814fc143f 8f7779c6eb9f7fa1 7299aeadb6889018",
                "501d289e4900f7e4 331b99dec4b5433a c7d329eeb6dd2654 5e96e55b874be909");

    // messages of zero bytes around the padding boundary at 112 bytes
    memset(million, 0, 1000);

    sha512_test("'\\0'*111", million, 111,
                "77ddd3a542e530fd 047b8977c657ba6c e72f1492e360b2b2 212cd264e75ec038",
                "82e4ff0525517ab4 207d14c70c2259ba 88d4d335ee0e7e20 543d22102ab1788c");

    sha512_test("'\\0'*112", million, 112,
                "2be2e788c8a8adea a9c89a7f78904cac ea6e39297d75e057 3a73c756234534d6",
                "627ab4156b48a665 7b29ab8beb733340 40ad39ead81446bb 09c70704ec707952");

    sha512_test("'\\0'*113", million, 113,
                "0e67910bcf0f9ccd e5464c63b9c850a1 2a759227d16b040d 98986d54253f9f34",
                "322318e56b8feb86 c5fb2270ed87f312 52f7f68493ee7597 43909bd75e4bb544");

    sha512_test("'\\0'*122", million, 122,
                "4f3f095d015be4a7 a7cc0b8c04da4aa0 9e74351e3a97651f 744c23716ebd9b3e",
                "822e5077a01baa5c c0ed45b9249e88ab 343d4333539df21e d229da6f4a514e0f");

    sha512_test("'\\0'*1000", million, 1000,
                "ca3dff61bb23477a a6087b27508264a6 f9126ee3a004f53c b8db942ed345f2f2",
                "d229b4b59c859220 a1cf1913f34248e3 803bab650e849a3d 9a709edc09ae4a76");

    memset(million, 'A', 1000);
    sha512_test("'A'*1000", million, 1000,
                "329c52ac62d1fe73 1151f2b895a00475 445ef74f50b979c6 f7bb7cae349328c1",
                "d4cb4f7261a0ab43 f936a24b000651d4 a824fcdd577f211a ef8f806b16afe8af");

    memset(million, 'U', 1005);
    sha512_test("'U'*1005", million, 1005,
                "59f5e54fe299c6a8 764c6b199e44924a 37f59e2b56c3ebad 939b7289210dc8e4",
                "c21b9720165b0f4d 4374c90f1bf4fb4a 5ace17a116179801 5052893a48c3d161");

    // expected value for one million 'a' from FIPS 180 (the digest of one
    // million zero bytes belongs to the next test only)
    memset(million, 'a', 1000000);
    sha512_test("'a'*1000000", million, 1000000,
                "e718483d0ce76964 4e2e42c7bc15b463 8e1f98b13b204428 5632a803afa973eb",
                "de0ff244877ea60a 4cb0432ce577c31b eb009c5c2c49aa2e 4eadb217ad8cc09b");

    memset(million, 0, 1000000);
    sha512_test("'\\0'*1000000", million, 1000000,
                "ce044bc9fd43269d 5bbc946cbebc3bb7 11341115cc4abdf2 edbc3ff2c57ad4b1",
                "5deb699bda257fea 5aef9c6e55fcf4cf 9dc25a8c3ce25f2e fe90908379bff7ed");

    // benchmark: n blocks of 128 bytes = 1 GiB; the block contents do not
    // matter for the timing, but must not be uninitialised
    memset(&context, 0, sizeof context);
    sha512_init(&context);

    t = clock();
    do sha512_core(&context); while (--n);
    t = clock() - t;

    // clock_t is not necessarily unsigned long: convert for "%lu"
    printf("%lu.%06lu seconds per GiB\n",
           (unsigned long) (t / CLOCKS_PER_SEC),
           (unsigned long) ((t % CLOCKS_PER_SEC) * 1000000u / CLOCKS_PER_SEC));

    return 0;
}
Execution of this program on Matt Godbolt’s
compiler explorer
using GCC 13.2.0
x86-64 yields the following output:
[…] "" cf83e1357eefb8bd f1542850d66d8007 d620e4050b5715dc 83f4a921d36ce9ce CF83E1357EEFB8BD F1542850D66D8007 D620E4050B5715DC 83F4A921D36CE9CE 47D0D13C5D85F2B0 FF8318D2877EEC2F 63B931BD47417A81 A538327AF927DA3E 47d0d13c5d85f2b0 ff8318d2877eec2f 63b931bd47417a81 a538327af927da3e "abc" ddaf35a193617aba cc417349ae204131 12e6fa4e89a97ea2 0a9eeee64b55d39a DDAF35A193617ABA CC417349AE204131 12E6FA4E89A97EA2 0A9EEEE64B55D39A 2192992A274FC1A8 36BA3C23A3FEEBBD 454D4423643CE80E 2A9AC94FA54CA49F 2192992a274fc1a8 36ba3c23a3feebbd 454d4423643ce80e 2a9ac94fa54ca49f "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" 8e959b75dae313da 8cf4f72814fc143f 8f7779c6eb9f7fa1 7299aeadb6889018 8E959B75DAE313DA 8CF4F72814FC143F 8F7779C6EB9F7FA1 7299AEADB6889018 501D289E4900F7E4 331B99DEC4B5433A C7D329EEB6DD2654 5E96E55B874BE909 501d289e4900f7e4 331b99dec4b5433a c7d329eeb6dd2654 5e96e55b874be909 '\0'*111 77ddd3a542e530fd 047b8977c657ba6c e72f1492e360b2b2 212cd264e75ec038 77DDD3A542E530FD 047B8977C657BA6C E72F1492E360B2B2 212CD264E75EC038 82E4FF0525517AB4 207D14C70C2259BA 88D4D335EE0E7E20 543D22102AB1788C 82e4ff0525517ab4 207d14c70c2259ba 88d4d335ee0e7e20 543d22102ab1788c '\0'*112 2be2e788c8a8adea a9c89a7f78904cac ea6e39297d75e057 3a73c756234534d6 2BE2E788C8A8ADEA A9C89A7F78904CAC EA6E39297D75E057 3A73C756234534D6 627AB4156B48A665 7B29AB8BEB733340 40AD39EAD81446BB 09C70704EC707952 627ab4156b48a665 7b29ab8beb733340 40ad39ead81446bb 09c70704ec707952 '\0'*113 0e67910bcf0f9ccd e5464c63b9c850a1 2a759227d16b040d 98986d54253f9f34 0E67910BCF0F9CCD E5464C63B9C850A1 2A759227D16B040D 98986D54253F9F34 322318E56B8FEB86 C5FB2270ED87F312 52F7F68493EE7597 43909BD75E4BB544 322318e56b8feb86 c5fb2270ed87f312 52f7f68493ee7597 43909bd75e4bb544 '\0'*122 4f3f095d015be4a7 a7cc0b8c04da4aa0 9e74351e3a97651f 744c23716ebd9b3e 4F3F095D015BE4A7 A7CC0B8C04DA4AA0 9E74351E3A97651F 744C23716EBD9B3E 822E5077A01BAA5C C0ED45B9249E88AB 343D4333539DF21E D229DA6F4A514E0F 
822e5077a01baa5c c0ed45b9249e88ab 343d4333539df21e d229da6f4a514e0f '\0'*1000 ca3dff61bb23477a a6087b27508264a6 f9126ee3a004f53c b8db942ed345f2f2 CA3DFF61BB23477A A6087B27508264A6 F9126EE3A004F53C B8DB942ED345F2F2 D229B4B59C859220 A1CF1913F34248E3 803BAB650E849A3D 9A709EDC09AE4A76 d229b4b59c859220 a1cf1913f34248e3 803bab650e849a3d 9a709edc09ae4a76 'A'*1000 329c52ac62d1fe73 1151f2b895a00475 445ef74f50b979c6 f7bb7cae349328c1 329C52AC62D1FE73 1151F2B895A00475 445EF74F50B979C6 F7BB7CAE349328C1 D4CB4F7261A0AB43 F936A24B000651D4 A824FCDD577F211A EF8F806B16AFE8AF d4cb4f7261a0ab43 f936a24b000651d4 a824fcdd577f211a ef8f806b16afe8af 'U'*1005 59f5e54fe299c6a8 764c6b199e44924a 37f59e2b56c3ebad 939b7289210dc8e4 59F5E54FE299C6A8 764C6B199E44924A 37F59E2B56C3EBAD 939B7289210DC8E4 C21B9720165B0F4D 4374C90F1BF4FB4A 5ACE17A116179801 5052893A48C3D161 c21b9720165b0f4d 4374c90f1bf4fb4a 5ace17a116179801 5052893a48c3d161 'a'*1000000 e718483d0ce76964 4e2e42c7bc15b463 8e1f98b13b204428 5632a803afa973eb E718483D0CE76964 4E2E42C7BC15B463 8E1F98B13B204428 5632A803AFA973EB DE0FF244877EA60A 4CB0432CE577C31B EB009C5C2C49AA2E 4EADB217AD8CC09B de0ff244877ea60a 4cb0432ce577c31b eb009c5c2c49aa2e 4eadb217ad8cc09b '\0'*1000000 ce044bc9fd43269d 5bbc946cbebc3bb7 11341115cc4abdf2 edbc3ff2c57ad4b1 CE044BC9FD43269D 5BBC946CBEBC3BB7 11341115CC4ABDF2 EDBC3FF2C57AD4B1 5DEB699BDA257FEA 5AEF9C6E55FCF4CF 9DC25A8C3CE25F2E FE90908379BFF7ED 5deb699bda257fea 5aef9c6e55fcf4cf 9dc25a8c3ce25f2e fe90908379bff7ed 3.018781 seconds per GiBIn other units: 355.7 MB per second.
# Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
# Unix System V calling convention for AMD64 platform:
# - first 6 floating-point arguments (from left to right) are passed in
# registers XMM0 to XMM5;
# - first 6 integer or pointer arguments (from left to right) are passed
# in registers RDI/R7, RSI/R6, RDX/R2, RCX/R1, R8 and R9
# (R10 is used as static chain pointer in case of nested functions);
# - surplus arguments are pushed on stack in reverse order (from right to
# left), 8-byte aligned;
# - 128-bit integer arguments are passed as pair of 64-bit integer arguments,
# low part before/below high part;
# - 128-bit integer result is returned in registers RAX/R0 (low part) and
# RDX/R2 (high part);
# - 64-bit integer or pointer result is returned in register RAX/R0;
# - 32-bit integer result is returned in register EAX;
# - floating-point result is returned in register XMM0;
# - registers RBX/R3, RSP/R4, RBP/R5, R12 to R15 must be preserved;
# - registers RAX/R0, RCX/R1, RDX/R2, RSI/R6, RDI/R7, R8, R9, R10 (in
# case of normal functions), R11 and XMM0 to XMM15 are volatile and can
# be clobbered;
# - stack is 16-byte aligned: callee must decrement RSP by 8+n*16 bytes
# before calling other functions (CALL instruction pushes 8 bytes);
# - a "red zone" of 128 bytes below the stack pointer can be clobbered.
.ident "Copyright (C) 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>"
.file "sha-512.s"
.arch generic64
.code64
.att_syntax
# Layout of the SHA512_CTX structure, defined as absolute symbols that give
# the byte offset of each member: state = 0, count = 64, block = 80.
.struct 0 # SHA512_CTX structure
state: # 8 quad words
.space 8*8
count: # 1 octa word
.space 2*8
block: # 16 quad words
.space 16*8
.altmacro # ??? h: %r%(15-(\t)&7) ??? a: %r%(8-(\t)&7) ???
# sha512 k, t: expands to SHA-512 round number t (0 to 79) with the round
# constant k.
# On entry RDI holds the address of the context structure and RSI the
# address of block[]. The eight working variables a to h live in state[]
# (offset 0 from RDI) and are addressed with an index rotated by the round
# number, block[] doubles as the folded 16-entry message schedule.
# Clobbers RAX, RBX, RCX and RDX.
.macro sha512 k :req, t :req
.if \t < 16
movq 8*\t(%rsi), %rax
bswapq %rax # rax = ntohll(block[t])
.else
movq 8*((\t-15)&15)(%rsi), %rax
movq 8*((\t-2)&15)(%rsi), %rbx
movq %rax, %rcx # rax = m = block[t-15&15]
movq %rbx, %rdx # rbx = n = block[t-2&15]
shrq $7, %rax # rax = m >> 7
shrq $6, %rbx # rbx = n >> 6
rorq $1, %rcx # rcx = ROTR(m, 1)
rorq $19, %rdx # rdx = ROTR(n, 19)
xorq %rcx, %rax # rax = (m >> 7) ^ ROTR(m, 1)
xorq %rdx, %rbx # rbx = (n >> 6) ^ ROTR(n, 19)
rorq $(8-1), %rcx # rcx = ROTR(m, 8)
rorq $(61-19), %rdx # rdx = ROTR(n, 61)
xorq %rcx, %rax # rax = (m >> 7) ^ ROTR(m, 1) ^ ROTR(m, 8)
# = SMALL_0(m)
xorq %rdx, %rbx # rbx = (n >> 6) ^ ROTR(n, 19) ^ ROTR(n, 61)
# = SMALL_1(n)
addq 8*((\t-16)&15)(%rsi), %rax
addq 8*((\t-7)&15)(%rsi), %rbx
addq %rbx, %rax # rax = SMALL_0(block[t-15&15]) + block[t-16&15]
# + SMALL_1(block[t-2&15]) + block[t-7&15]
.endif
movq %rax, 8*((\t)&15)(%rsi) # block[t&15] = (t < 16)
# ? ntohll(block[t])
# : SMALL_0(block[t-15&15]) + block[t-16&15]
# + SMALL_1(block[t-2&15]) + block[t-7&15]
addq 8*((87-\t)&7)(%rdi), %rax
# rax = block[t&15] + state[71-t&7]
# = block[t&15] + h
movq 8*((86-\t)&7)(%rdi), %rbx
# rbx = g = state[70-t&7]
movq 8*((85-\t)&7)(%rdi), %rcx
# rcx = f = state[69-t&7]
movq 8*((84-\t)&7)(%rdi), %rdx
# rdx = e = state[68-t&7]
xorq %rbx, %rcx # rcx = f ^ g
andq %rdx, %rcx # rcx = e & (f ^ g)
xorq %rbx, %rcx # rcx = e & (f ^ g) ^ g
# = CH(e, f, g)
addq %rcx, %rax # rax = block[t&15] + CH(e, f, g) + h
# the round constant is an IMMEDIATE operand: without the $ prefix the
# assembler would treat it as an absolute memory address and load from there
movabsq $\k, %rcx # rcx = k[t]
addq %rcx, %rax # rax = block[t&15] + CH(e, f, g) + h + k[t]
movq %rdx, %rbx # rbx = e
rorq $14, %rdx # rdx = ROTR(e, 14)
rorq $18, %rbx # rbx = ROTR(e, 18)
xorq %rbx, %rdx # rdx = ROTR(e, 14) ^ ROTR(e, 18)
rorq $(41-18), %rbx # rbx = ROTR(e, 41)
xorq %rdx, %rbx # rbx = ROTR(e, 14) ^ ROTR(e, 18) ^ ROTR(e, 41)
# = SIGMA_1(e)
addq %rbx, %rax # rax = SIGMA_1(e) + CH(e, f, g) + h + k[t] + block[t&15]
# = T1
movq 8*((80-\t)&7)(%rdi), %rbx
# rbx = a
movq 8*((81-\t)&7)(%rdi), %rcx
# rcx = b
movq %rbx, %rdx # rdx = a
xorq %rcx, %rbx # rbx = a ^ b
andq %rdx, %rcx # rcx = a & b
andq 8*((82-\t)&7)(%rdi), %rbx
# rbx = (a ^ b) & c
addq %rax, 8*((83-\t)&7)(%rdi)
# d' = d + T1
orq %rcx, %rbx # rbx = (a & b) | ((a ^ b) & c)
# = MAJ(a, b, c)
addq %rbx, %rax # rax = T1 + MAJ(a, b, c)
movq %rdx, %rcx # rcx = a
rorq $28, %rdx # rdx = ROTR(a, 28)
rorq $34, %rcx # rcx = ROTR(a, 34)
xorq %rcx, %rdx # rdx = ROTR(a, 28) ^ ROTR(a, 34)
rorq $(39-34), %rcx # rcx = ROTR(a, 39)
xorq %rcx, %rdx # rdx = ROTR(a, 28) ^ ROTR(a, 34) ^ ROTR(a, 39)
# = SIGMA_0(a)
addq %rdx, %rax # rax = T1 + T2
movq %rax, 8*((87-\t)&7)(%rdi)
# h' = T1 + T2
.endm
.text
# void SHA512_Core(SHA512_CTX *context)
# Runs the 80 rounds of the compression function over block[] of the context
# and updates state[].
# NOTE: the sha512 macro does not use R8 to R15; it keeps the working
# variables a to h in state[] itself. R8 to R15 therefore only preserve the
# previous hash value, which is added to the final working variables after
# the last round.
# block[] is overwritten with message schedule words during the rounds.
sha512_core: # void SHA512_Core(SHA512_CTX *context)
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
# save previous hash value; state[] itself holds the working variables a to h
movq state(%rdi), %r8 # r8 = previous state[0] (initial a)
movq state+8(%rdi), %r9 # r9 = previous state[1] (initial b)
movq state+16(%rdi), %r10 # r10 = previous state[2] (initial c)
movq state+24(%rdi), %r11 # r11 = previous state[3] (initial d)
movq state+32(%rdi), %r12 # r12 = previous state[4] (initial e)
movq state+40(%rdi), %r13 # r13 = previous state[5] (initial f)
movq state+48(%rdi), %r14 # r14 = previous state[6] (initial g)
movq state+56(%rdi), %r15 # r15 = previous state[7] (initial h)
leaq block(%rdi), %rsi
# rsi = address of block, as expected by the sha512 macro
# calculate 80 rounds
sha512 0x428A2F98D728AE22, 0
sha512 0x7137449123EF65CD, 1
sha512 0xB5C0FBCFEC4D3B2F, 2
sha512 0xE9B5DBA58189DBBC, 3
sha512 0x3956C25BF348B538, 4
sha512 0x59F111F1B605D019, 5
sha512 0x923F82A4AF194F9B, 6
sha512 0xAB1C5ED5DA6D8118, 7
sha512 0xD807AA98A3030242, 8
sha512 0x12835B0145706FBE, 9
sha512 0x243185BE4EE4B28C, 10
sha512 0x550C7DC3D5FFB4E2, 11
sha512 0x72BE5D74F27B896F, 12
sha512 0x80DEB1FE3B1696B1, 13
sha512 0x9BDC06A725C71235, 14
sha512 0xC19BF174CF692694, 15
sha512 0xE49B69C19EF14AD2, 16
sha512 0xEFBE4786384F25E3, 17
sha512 0x0FC19DC68B8CD5B5, 18
sha512 0x240CA1CC77AC9C65, 19
sha512 0x2DE92C6F592B0275, 20
sha512 0x4A7484AA6EA6E483, 21
sha512 0x5CB0A9DCBD41FBD4, 22
sha512 0x76F988DA831153B5, 23
sha512 0x983E5152EE66DFAB, 24
sha512 0xA831C66D2DB43210, 25
sha512 0xB00327C898FB213F, 26
sha512 0xBF597FC7BEEF0EE4, 27
sha512 0xC6E00BF33DA88FC2, 28
sha512 0xD5A79147930AA725, 29
sha512 0x06CA6351E003826F, 30
sha512 0x142929670A0E6E70, 31
sha512 0x27B70A8546D22FFC, 32
sha512 0x2E1B21385C26C926, 33
sha512 0x4D2C6DFC5AC42AED, 34
sha512 0x53380D139D95B3DF, 35
sha512 0x650A73548BAF63DE, 36
sha512 0x766A0ABB3C77B2A8, 37
sha512 0x81C2C92E47EDAEE6, 38
sha512 0x92722C851482353B, 39
sha512 0xA2BFE8A14CF10364, 40
sha512 0xA81A664BBC423001, 41
sha512 0xC24B8B70D0F89791, 42
sha512 0xC76C51A30654BE30, 43
sha512 0xD192E819D6EF5218, 44
sha512 0xD69906245565A910, 45
sha512 0xF40E35855771202A, 46
sha512 0x106AA07032BBD1B8, 47
sha512 0x19A4C116B8D2D0C8, 48
sha512 0x1E376C085141AB53, 49
sha512 0x2748774CDF8EEB99, 50
sha512 0x34B0BCB5E19B48A8, 51
sha512 0x391C0CB3C5C95A63, 52
sha512 0x4ED8AA4AE3418ACB, 53
sha512 0x5B9CCA4F7763E373, 54
sha512 0x682E6FF3D6B2B8A3, 55
sha512 0x748F82EE5DEFB2FC, 56
sha512 0x78A5636F43172F60, 57
sha512 0x84C87814A1F0AB72, 58
sha512 0x8CC702081A6439EC, 59
sha512 0x90BEFFFA23631E28, 60
sha512 0xA4506CEBDE82BDE9, 61
sha512 0xBEF9A3F7B2C67915, 62
sha512 0xC67178F2E372532B, 63
sha512 0xCA273ECEEA26619C, 64
sha512 0xD186B8C721C0C207, 65
sha512 0xEADA7DD6CDE0EB1E, 66
sha512 0xF57D4F7FEE6ED178, 67
sha512 0x06F067AA72176FBA, 68
sha512 0x0A637DC5A2C898A6, 69
sha512 0x113F9804BEF90DAE, 70
sha512 0x1B710B35131C471B, 71
sha512 0x28DB77F523047D84, 72
sha512 0x32CAAB7B40C72493, 73
sha512 0x3C9EBE0A15C9BEBC, 74
sha512 0x431D67C49C100D4C, 75
sha512 0x4CC5D4BECB3E42B6, 76
sha512 0x597F299CFC657E2A, 77
sha512 0x5FCB6FAB3AD6FAEC, 78
sha512 0x6C44198C4A475817, 79
# add previous hash value to the final working variables held in state[]
addq %r8, state(%rdi)
addq %r9, state+8(%rdi)
addq %r10, state+16(%rdi)
addq %r11, state+24(%rdi)
addq %r12, state+32(%rdi)
addq %r13, state+40(%rdi)
addq %r14, state+48(%rdi)
addq %r15, state+56(%rdi)
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
retq
.global sha512_core
.size sha512_core, .-sha512_core
.type sha512_core, @function
# void SHA512_Final(SHA512_CTX *context, unsigned char digest[64])
# Pads the data buffered in block[] with a single 1 bit, zeroes and the
# 128-bit big-endian message length in bits, processes the padded block(s)
# with sha512_core() and stores state[] in big-endian byte order to digest.
# When ALIGNED is defined, the digest is written with 8-byte stores,
# else with single-byte stores.
sha512_final: # void SHA512_Final(SHA512_CTX *context,
# unsigned char digest[64])
movq %rdi, %rdx # rdx = address of context structure
movq count(%rdi), %rcx # rcx = low quad word of count
andl $127, %ecx # rcx = number of bytes in block
# = index of first free byte in block
leaq block(%rdi, %rcx), %rdi # rdi = address of first free byte in block
movq %rdi, %r11
.Lpad_1:
movb $0b10000000, %al
stosb # block[index] = 0b10000000
.Lpad_0:
xorl %eax, %eax # rax = 0
xorl $127, %ecx # rcx = number of free bytes in block - 1
# = 127 - index
rep stosb # block[index + 1, 127] = 0
subq $16, %rdi # rdi = address of last octa word in block
cmpq %r11, %rdi
ja .Lpad_count # space for count available in block?
# index < 112?
pushq %rsi
pushq %rdi
pushq %rdx
movq %rdx, %rdi # rdi = address of context structure
callq sha512_core # process the block which has no room for count
popq %rdx
popq %rdi # rdi = address of last octa word in block
popq %rsi # rsi = address of digest
.Lpad_block:
movq %rdi, %rcx # rcx = address of last octa word in block
leaq block(%rdx), %rdi # rdi = address of block
xorl %eax, %eax # rax = 0
subq %rdi, %rcx # rcx = number of bytes before last octa word
# = 112
rep stosb # block[0, 111] = 0,
# rdi = address of last octa word in block
.Lpad_count:
movq count(%rdx), %rcx
movq count+8(%rdx), %rax # rax:rcx = count
shldq $3, %rcx, %rax
shlq $3, %rcx # rax:rcx = count * 8
# = number of message bits
bswapq %rax
bswapq %rcx # rax:rcx = htonlll(number of message bits)
stosq
movq %rcx, %rax
stosq # block[112, 127] = htonlll(number of message bits)
pushq %rsi
pushq %rdi
pushq %rdx
movq %rdx, %rdi # rdi = address of context structure
callq sha512_core # process the final padded block
popq %rsi # rsi = address of state
popq %rax
popq %rdi # rdi = address of digest
movl $8, %ecx # ecx = number of quad words
.Ldigest:
lodsq
.ifdef ALIGNED
bswapq %rax
stosq
.else
# store the quad word byte by byte, most significant byte first
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
rolq $8, %rax
stosb
.endif
decl %ecx
jnz .Ldigest
retq
.global sha512_final
.size sha512_final, .-sha512_final
.type sha512_final, @function
# void SHA512_Init(SHA512_CTX *context)
# Stores the eight initial hash values H0 to H7 in state[] and clears the
# 128-bit byte count. RDI (the context argument) is advanced by the string
# stores. The constants are IMMEDIATE operands and therefore need the $
# prefix; without it they would be used as absolute memory addresses.
sha512_init: # void SHA512_Init(SHA512_CTX *context)
movabsq $0x6A09E667F3BCC908, %rax
stosq # state[0] = H0
movabsq $0xBB67AE8584CAA73B, %rax
stosq # state[1] = H1
movabsq $0x3C6EF372FE94F82B, %rax
stosq # state[2] = H2
movabsq $0xA54FF53A5F1D36F1, %rax
stosq # state[3] = H3
movabsq $0x510E527FADE682D1, %rax
stosq # state[4] = H4
movabsq $0x9B05688C2B3E6C1F, %rax
stosq # state[5] = H5
movabsq $0x1F83D9ABFB41BD6B, %rax
stosq # state[6] = H6
movabsq $0x5BE0CD19137E2179, %rax
stosq # state[7] = H7
xorl %eax, %eax # rax = 0
stosq
stosq # count = 0
retq
.global sha512_init
.size sha512_init, .-sha512_init
.type sha512_init, @function
# void SHA512_Update(SHA512_CTX *context, void const *data,
#                    unsigned long long size)
# Adds size to the 128-bit byte count, then copies the data into block[]
# behind the bytes already buffered there; each time block[] is filled
# completely it is processed with sha512_core(). A trailing partial block
# stays buffered in block[]. Returns immediately when size is 0.
sha512_update: # void SHA512_Update(SHA512_CTX *context,
# void const *data,
# unsigned long long size)
testq %rdx, %rdx
jz .Lnone # no data?
movq count(%rdi), %rcx # rcx = low quad word of count
andl $127, %ecx # rcx = number of bytes in block
# = index of first free byte in block
addq %rdx, count(%rdi)
adcq $0, count+8(%rdi) # count += number of bytes in data
movq %rdi, %rax # rax = address of context structure
.Ldata:
leaq block(%rax, %rcx), %rdi # rdi = address of first free byte in block
xorl $127, %ecx
incl %ecx # rcx = number of free bytes in block
subq %rcx, %rdx # rdx = number of bytes in data
# - number of free bytes in block
jb .Llast # number of bytes in data < number of free bytes in block?
.Lmore:
rep movsb # rsi = address of remaining data
# rsi, rdx and rax are saved around the call since the callee may clobber them
pushq %rsi
pushq %rdx
pushq %rax
movq %rax, %rdi # rdi = address of context structure
callq sha512_core
popq %rax
popq %rdx
popq %rsi
xorl %ecx, %ecx # rcx = 0 = index of first free byte in block
testq %rdx, %rdx
jnz .Ldata # more data?
.Lnone:
retq
.Llast:
# rdx is negative here: adding it to the number of free bytes yields the
# number of remaining data bytes
addq %rdx, %rcx # rcx = number of bytes in data
rep movsb
retq
.global sha512_update
.size sha512_update, .-sha512_update
.type sha512_update, @function
.end
#include
the following
ANSI C
header file in your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
/* SHA-512 hashing context; the member order (state, count, block) must match
   the structure layout used by the assembler routines. */
typedef struct _sha512_ctx {
    unsigned long long state[8];    /* hash value */
    unsigned long long count[2];    /* 128-bit number of message bytes */
    unsigned long long block[16];   /* buffered data of the current block */
} sha512_ctx;

/* Processes block[] of the context and updates state[]. */
extern void sha512_core(sha512_ctx *context);

/* Pads and processes the buffered data, then writes the digest. */
extern void sha512_final(sha512_ctx *context, unsigned char digest[64]);

/* Sets state[] to the initial hash value and count[] to 0. */
extern void sha512_init(sha512_ctx *context);

/* Feeds size bytes of data into the hash. */
extern void sha512_update(sha512_ctx *context, void const *data, unsigned long long size);
; Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; short internal names for the public symbols: every use of core, final,
; init and update below is replaced with the respective SHA512_* name
core textequ <SHA512_Core>
final textequ <SHA512_Final>
init textequ <SHA512_Init>
update textequ <SHA512_Update>
; layout of the context structure: state at offset 0 (64 bytes),
; count at offset 64 (16 bytes), block at offset 80 (128 bytes)
context struct 8 ; SHA512_CTX structure
state qword 8 dup (?)
count qword 2 dup (?)
block qword 16 dup (?) ; also "message schedule" W'[16]
context ends
; in order to fold the message schedule W[80] to W'[16] alias block[16],
; W[i] becomes W'[i % 16] alias block[i & 15]
; in order to rotate the 8 working variables a to h through R8 to R15,
; a becomes R((80 - t) % 8 + 8) alias R(8 - t & 7),
; b becomes R((80 + 'b' - 'a' - t) % 8 + 8) alias R(9 - t & 7),
; etc.
; in order to use 8-bit displacements for memory accesses, RBP is loaded
; with the address of the block[] array; block[] has displacements 0 to 120
; from RBP, count[] has displacements -16 and -8 from RBP, and state[] has
; displacements -80 to -24 from RBP: block[i] becomes [rbp+i*8], count[i]
; becomes [rbp-16+i*8], state[i] becomes [rbp-80+i*8]
; state(s): macro function which yields the name of the register selected
; by the expression s, i.e. R8 for s mod 8 = 0 up to R15 for s mod 8 = 7
state macro s :req
exitm @CatStr(<r>, %((&s) mod 8 + 8))
endm
; sha512 k, t: expands to SHA-512 round number t (0 to 79) with the round
; constant k; both arguments must be constants.
; expects RBP = address of block[] (the folded message schedule) and the
; working variables a to h in R8 to R15, selected through state() with an
; index rotated by the round number; clobbers RAX, RBX, RCX and RDX.
; the instructions of the independent partial computations are interleaved
sha512 macro k :req, t :req
.erre 4 and (opattr &k) and (opattr &t)
.erre 0 le &t and &t lt 80
if &t lt 16
mov rax, [rbp+8*&t]
bswap rax ;; rax = ntohll(W[t])
else ; &t ge 16
mov rax, [rbp+(&t-15) mod 16 shl 3]
mov rbx, [rbp+(&t-2) mod 16 shl 3]
mov rcx, rax ;; rcx = m = W[t - 15]
mov rdx, rbx ;; rdx = n = W[t - 2]
shr rax, 7 ;; rax = m >> 7
shr rbx, 6 ;; rbx = n >> 6
ror rcx, 1 ;; rcx = ROTR(m, 1)
ror rdx, 19 ;; rdx = ROTR(n, 19)
xor rax, rcx ;; rax = (m >> 7) ^ ROTR(m, 1)
xor rbx, rdx ;; rbx = (n >> 6) ^ ROTR(n, 19)
ror rcx, 8-1 ;; rcx = ROTR(m, 8)
ror rdx, 61-19 ;; rdx = ROTR(n, 61)
xor rax, rcx ;; rax = (m >> 7) ^ ROTR(m, 1) ^ ROTR(m, 8)
;; = SMALL_0(m)
xor rbx, rdx ;; rbx = (n >> 6) ^ ROTR(n, 19) ^ ROTR(n, 61)
;; = SMALL_1(n)
add rax, [rbp+(&t-16) mod 16 shl 3]
add rbx, [rbp+(&t-7) mod 16 shl 3]
add rax, rbx ;; rax = SMALL_0(W[t - 15]) + W[t - 16]
;; + SMALL_1(W[t - 2]) + W[t - 7]
endif ; &t ge 16
mov [rbp+(&t) mod 16 shl 3], rax
;; W[t] = (t < 16) ? ntohll(W[t])
;; : SMALL_0(W[t - 15]) + W[t - 16]
;; + SMALL_1(W[t - 2]) + W[t - 7]
mov rbx, &k ;; rbx = k[t]
mov rcx, state(84-&t) ;; rcx = e
add rax, rbx ;; rax = W[t] + k[t]
mov rdx, state(84-&t) ;; rdx = e
add rax, state(87-&t) ;; rax = W[t] + k[t] + h
ror rcx, 14 ;; rcx = ROTR(e, 14)
mov rbx, state(86-&t) ;; rbx = g
ror rdx, 18 ;; rdx = ROTR(e, 18)
xor rbx, state(85-&t) ;; rbx = g ^ f
xor rcx, rdx ;; rcx = ROTR(e, 14) ^ ROTR(e, 18)
and rbx, state(84-&t) ;; rbx = (g ^ f) & e
ror rdx, 41-18 ;; rdx = ROTR(e, 41)
xor rbx, state(86-&t) ;; rbx = (g ^ f) & e ^ g
;; = CH(e, f, g)
xor rcx, rdx ;; rcx = ROTR(e, 14) ^ ROTR(e, 18) ^ ROTR(e, 41)
;; = SIGMA_1(e)
add rax, rbx ;; rax = W[t] + k[t] + h + CH(e, f, g)
add rax, rcx ;; rax = W[t] + k[t] + h + CH(e, f, g) + SIGMA_1(e)
;; = T1
add state(83-&t), rax ;; d' = d + T1
mov state(87-&t), rax ;; h' = T1
mov rax, state(80-&t) ;; rax = a
mov rbx, state(81-&t) ;; rbx = b
mov rcx, state(80-&t) ;; rcx = a
mov rdx, state(80-&t) ;; rdx = a
ror rcx, 28 ;; rcx = ROTR(a, 28)
xor rax, state(81-&t) ;; rax = a ^ b
ror rdx, 34 ;; rdx = ROTR(a, 34)
and rbx, state(80-&t) ;; rbx = a & b
xor rcx, rdx ;; rcx = ROTR(a, 28) ^ ROTR(a, 34)
and rax, state(82-&t) ;; rax = (a ^ b) & c
ror rdx, 39-34 ;; rdx = ROTR(a, 39)
or rax, rbx ;; rax = (a & b) | ((a ^ b) & c)
;; = MAJ(a, b, c)
xor rcx, rdx ;; rcx = ROTR(a, 28) ^ ROTR(a, 34) ^ ROTR(a, 39)
;; = SIGMA_0(a)
add rax, rcx ;; rax = T2
add state(87-&t), rax ;; h" = T1 + T2
endm
.code
; void SHA512_Core(SHA512_CTX *context)
; runs the 80 rounds of the compression function over block[] of the context
; (address in RCX) with the working variables a to h held in R8 to R15, then
; adds them to state[]; block[] is overwritten with message schedule words.
; RBP, RBX and R12 to R15 are saved and restored; RAX, RCX, RDX and R8 to R11
; are clobbered
core proc public ; void SHA512_Core(SHA512_CTX *context)
push rbp
push rbx
push r12
push r13
push r14
push r15
lea rbp, context.block[rcx] ; rbp = address of block
mov r8, context.state[rcx] ; load working variables from state
mov r9, context.state[rcx+8]
mov r10, context.state[rcx+16]
mov r11, context.state[rcx+24]
mov r12, context.state[rcx+32]
mov r13, context.state[rcx+40]
mov r14, context.state[rcx+48]
mov r15, context.state[rcx+56]
; calculate 80 rounds
sha512 0428A2F98D728AE22h, 0
sha512 07137449123EF65CDh, 1
sha512 0B5C0FBCFEC4D3B2Fh, 2
sha512 0E9B5DBA58189DBBCh, 3
sha512 03956C25BF348B538h, 4
sha512 059F111F1B605D019h, 5
sha512 0923F82A4AF194F9Bh, 6
sha512 0AB1C5ED5DA6D8118h, 7
sha512 0D807AA98A3030242h, 8
sha512 012835B0145706FBEh, 9
sha512 0243185BE4EE4B28Ch, 10
sha512 0550C7DC3D5FFB4E2h, 11
sha512 072BE5D74F27B896Fh, 12
sha512 080DEB1FE3B1696B1h, 13
sha512 09BDC06A725C71235h, 14
sha512 0C19BF174CF692694h, 15
sha512 0E49B69C19EF14AD2h, 16
sha512 0EFBE4786384F25E3h, 17
sha512 00FC19DC68B8CD5B5h, 18
sha512 0240CA1CC77AC9C65h, 19
sha512 02DE92C6F592B0275h, 20
sha512 04A7484AA6EA6E483h, 21
sha512 05CB0A9DCBD41FBD4h, 22
sha512 076F988DA831153B5h, 23
sha512 0983E5152EE66DFABh, 24
sha512 0A831C66D2DB43210h, 25
sha512 0B00327C898FB213Fh, 26
sha512 0BF597FC7BEEF0EE4h, 27
sha512 0C6E00BF33DA88FC2h, 28
sha512 0D5A79147930AA725h, 29
sha512 006CA6351E003826Fh, 30
sha512 0142929670A0E6E70h, 31
sha512 027B70A8546D22FFCh, 32
sha512 02E1B21385C26C926h, 33
sha512 04D2C6DFC5AC42AEDh, 34
sha512 053380D139D95B3DFh, 35
sha512 0650A73548BAF63DEh, 36
sha512 0766A0ABB3C77B2A8h, 37
sha512 081C2C92E47EDAEE6h, 38
sha512 092722C851482353Bh, 39
sha512 0A2BFE8A14CF10364h, 40
sha512 0A81A664BBC423001h, 41
sha512 0C24B8B70D0F89791h, 42
sha512 0C76C51A30654BE30h, 43
sha512 0D192E819D6EF5218h, 44
sha512 0D69906245565A910h, 45
sha512 0F40E35855771202Ah, 46
sha512 0106AA07032BBD1B8h, 47
sha512 019A4C116B8D2D0C8h, 48
sha512 01E376C085141AB53h, 49
sha512 02748774CDF8EEB99h, 50
sha512 034B0BCB5E19B48A8h, 51
sha512 0391C0CB3C5C95A63h, 52
sha512 04ED8AA4AE3418ACBh, 53
sha512 05B9CCA4F7763E373h, 54
sha512 0682E6FF3D6B2B8A3h, 55
sha512 0748F82EE5DEFB2FCh, 56
sha512 078A5636F43172F60h, 57
sha512 084C87814A1F0AB72h, 58
sha512 08CC702081A6439ECh, 59
sha512 090BEFFFA23631E28h, 60
sha512 0A4506CEBDE82BDE9h, 61
sha512 0BEF9A3F7B2C67915h, 62
sha512 0C67178F2E372532Bh, 63
sha512 0CA273ECEEA26619Ch, 64
sha512 0D186B8C721C0C207h, 65
sha512 0EADA7DD6CDE0EB1Eh, 66
sha512 0F57D4F7FEE6ED178h, 67
sha512 006F067AA72176FBAh, 68
sha512 00A637DC5A2C898A6h, 69
sha512 0113F9804BEF90DAEh, 70
sha512 01B710B35131C471Bh, 71
sha512 028DB77F523047D84h, 72
sha512 032CAAB7B40C72493h, 73
sha512 03C9EBE0A15C9BEBCh, 74
sha512 0431D67C49C100D4Ch, 75
sha512 04CC5D4BECB3E42B6h, 76
sha512 0597F299CFC657E2Ah, 77
sha512 05FCB6FAB3AD6FAECh, 78
sha512 06C44198C4A475817h, 79
; add working variables to state
; (state[] is addressed relative to RBP, the address of block[])
add context.state[rbp-context.block], r8
add context.state[rbp+8-context.block], r9
add context.state[rbp+16-context.block], r10
add context.state[rbp+24-context.block], r11
add context.state[rbp+32-context.block], r12
add context.state[rbp+40-context.block], r13
add context.state[rbp+48-context.block], r14
add context.state[rbp+56-context.block], r15
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
core endp
; void SHA512_Final(SHA512_CTX *context, unsigned char digest[64])
; pads the data buffered in block[] with a single 1 bit, zeroes and the
; 128-bit big-endian number of message bits, processes the padded block(s)
; with core and stores state[] in big-endian byte order to digest (RDX).
; with ALIGNED defined the digest is written with 8-byte stores, else with
; single-byte stores. RSI and RDI are preserved
final proc public ; void SHA512_Final(SHA512_CTX *context,
; unsigned char digest[64])
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = low qword of count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push rdi
lea rdi, context.block[r9+rcx]
mov r8, rdi ; r8 = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; rax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 127 - index
rep stosb ; block[index + 1, 127] = 0
sub rdi, sizeof context.count
; rdi = address of last oword in block
cmp r8, rdi
jb short pad_count ; space for count available in block?
; index < 112?
mov rcx, r9 ; rcx = address of context structure
; core clobbers RAX, RDX and R8 to R11: save the values still needed;
; RAX = 0 is saved too, so it need not be cleared again at pad_block
push r9
push r8
push rdx
push rax
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rax
pop rdx
pop r8
pop r9
pad_block:
mov rcx, rdi ; rcx = address of last oword in block
lea rdi, context.block[r9] ; rdi = address of block
;; xor eax, eax ; rax = 0
sub rcx, rdi ; rcx = number of bytes before last oword
; = 112
rep stosb ; block[0, 111] = 0,
; rdi = address of last oword in block
pad_count:
mov rax, context.count[r9+8]
mov rcx, context.count[r9] ; rax:rcx = count
shld rax, rcx, 3
shl rcx, 3 ; rax:rcx = count * 8
; = number of message bits
bswap rax
bswap rcx ; rax:rcx = htonlll(number of message bits)
stosq
mov rax, rcx
stosq ; block[112, 127] = number of message bits
mov rcx, r9 ; rcx = address of context structure
push r9
push rdx
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rdi ; rdi = address of digest
mov r9, rsi ; r9 = saved RSI, restored before return
pop rsi ; rsi = address of state
mov ecx, lengthof context.state
digest:
lodsq
ifndef ALIGNED
; store the qword byte by byte, most significant byte first
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
rol rax, 8
stosb
else ; ALIGNED
bswap rax
stosq
endif ; ALIGNED
dec ecx
jnz short digest
mov rsi, r9
pop rdi
ret
final endp
; void SHA512_Init(SHA512_CTX *context)
; stores the initial hash values H0 to H7 in state[] and clears the 128-bit
; byte count; RDI is kept in RCX during the string stores and restored
; before return
init proc public ; void SHA512_Init(SHA512_CTX *context)
xchg rdi, rcx ; rdi = address of context structure
mov rax, 06A09E667F3BCC908h ; rax = H0
stosq ; state[0] = H0
mov rax, 0BB67AE8584CAA73Bh ; rax = H1
stosq ; state[1] = H1
mov rax, 03C6EF372FE94F82Bh ; rax = H2
stosq ; state[2] = H2
mov rax, 0A54FF53A5F1D36F1h ; rax = H3
stosq ; state[3] = H3
mov rax, 0510E527FADE682D1h ; rax = H4
stosq ; state[4] = H4
mov rax, 09B05688C2B3E6C1Fh ; rax = H5
stosq ; state[5] = H5
mov rax, 01F83D9ABFB41BD6Bh ; rax = H6
stosq ; state[6] = H6
mov rax, 05BE0CD19137E2179h ; rax = H7
stosq ; state[7] = H7
xor eax, eax ; rax = 0
stosq
stosq ; count = 0
mov rdi, rcx ; restore RDI of caller
ret
init endp
; void SHA512_Update(SHA512_CTX *context, void const *data,
;                    unsigned long long size)
; adds size to the 128-bit byte count, then copies the data into block[]
; behind the bytes already buffered there; each time block[] is filled
; completely it is processed with core. a trailing partial block stays
; buffered in block[]. returns immediately when size is 0.
; RSI and RDI are preserved
update proc public ; void SHA512_Update(SHA512_CTX *context,
; void const *data,
; unsigned long long size)
test r8, r8
jz short none ; no data?
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = low qword of count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[r9], r8
adc context.count[r9+8], 0 ; count += number of bytes in data
push rsi
mov rsi, rdx ; rsi = address of data
push rdi
data:
lea rdi, context.block[r9+rcx]
; rdi = address of first free byte in block
xor ecx, sizeof context.block - 1
inc ecx ; rcx = number of free bytes in block
sub r8, rcx ; r8 = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
rep movsb ; rsi = address of remaining data
mov rdi, r9 ; rdi = address of context structure,
; kept there since core clobbers R9
mov rcx, r9 ; rcx = address of context structure
push r8
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop r8
mov r9, rdi
xor ecx, ecx ; rcx = 0 = index of first free byte in block
test r8, r8
jnz short data ; more data?
pop rdi
pop rsi
none:
ret
last:
; r8 is negative here: adding it to the number of free bytes yields the
; number of remaining data bytes
add rcx, r8 ; rcx = number of bytes in data
rep movsb
pop rdi
pop rsi
ret
update endp
end
Note: the function SHA512_Core()
has
3870 instructions in 13320 bytes.
Save the AMD64 assembler source presented above as sha-512.asm
in an arbitrary, preferably empty
directory, then execute the following 2 command lines to generate
the 64-bit object file sha-512.obj
:
SET ML=/c /W3 /X
ML64.EXE /DALIGNED sha-512.asm
For details and reference see the MSDN article ML and ML64 Command-Line Reference.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) Macro Assembler Version (x64) 10.00.40219.01 Copyright (C) Microsoft Corporation. All rights reserved. Assembling: sha-512.asm
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// SHA-512 hashing context; the member order (State, Count, Block) must match
// the context structure of the assembler source.
typedef struct _SHA512_CTX
{
    DWORD64 State[8];   // hash value
    DWORD64 Count[2];   // 128-bit number of message bytes
    DWORD64 Block[16];  // buffered data of the current block
} SHA512_CTX;

// Routines implemented in the assembler source sha-512.asm
VOID CDECL SHA512_Core(SHA512_CTX *Context);
VOID CDECL SHA512_Final(SHA512_CTX *Context, BYTE Digest[64]);
VOID CDECL SHA512_Init(SHA512_CTX *Context);
VOID CDECL SHA512_Update(SHA512_CTX *Context, LPCVOID Data, DWORD64 Size);
// __edivmodu(N, D): divides the 64-bit unsigned value N by the 32-bit
// unsigned value D and yields the 32-bit quotient and the 32-bit remainder.
#ifndef _M_IX86
// NOTE: expands to TWO comma-separated expressions (quotient, remainder),
// intended to supply two consecutive arguments of a function call
#define __edivmodu(N, D) (DWORD) ((N) / (D)), (DWORD) ((N) % (D))
#else
__forceinline // companion for __emulu()
struct
{
DWORD ulQuotient, ulRemainder;
} CDECL __edivmodu(DWORD64 ullDividend, DWORD ulDivisor)
{
// no return statement: DIV leaves the quotient in EAX and the remainder in
// EDX, presumably matching how the compiler returns this 8-byte structure
// NOTE(review): DIV faults when the quotient does not fit in 32 bits
__asm mov eax, dword ptr ullDividend
__asm mov edx, dword ptr ullDividend+4
__asm div ulDivisor
}
#endif // _M_IX86
// Formats a message with wvsprintf() into a buffer of 1024 wide characters
// and writes it to the console handle hConsole.
// Returns TRUE only if the formatted message is not empty and has been
// written completely, else FALSE.
__declspec(safebuffers)
BOOL CDECL PrintConsole(HANDLE hConsole, [SA_FormatString(Style="printf")] LPCWSTR lpFormat, ...)
{
    WCHAR szText[1024];
    DWORD cchText;
    DWORD cchWritten;
    va_list vaArguments;

    va_start(vaArguments, lpFormat);
    cchText = wvsprintf(szText, lpFormat, vaArguments);
    va_end(vaArguments);

    // nothing formatted: report failure without touching the console
    if (cchText == 0)
        return FALSE;

    if (!WriteConsole(hConsole, szText, cchText, &cchWritten, NULL))
        return FALSE;

    // a partial write counts as failure
    return cchWritten == cchText;
}
// Entry point of the test program: hashes a series of test messages with the
// SHA-512 routines, prints each expected digest next to the computed hash
// value, then measures the time (or, with CYCLES defined, the clock cycles)
// SHA512_Core() needs to process 1000000000 bytes.
// Terminates the process with the last Win32 error code recorded, or
// ERROR_SUCCESS.
__declspec(noreturn)
VOID CDECL wmainCRTStartup(VOID)
{
    SHA512_CTX Context;
    BYTE cbDigest[64], cbMillion[1000000];
    DWORD dwCPUID[12];
    DWORD dwError = ERROR_SUCCESS;
    DWORD dwThread = 1000000000 / 128;  // number of 128-byte blocks to process
    DWORD64 qwThread[2];                // start and end value of the measurement
#ifndef CYCLES
    FILETIME ftIgnored[3];              // creation, exit and kernel time (not used)
#endif
    HANDLE hThread = GetCurrentThread();
    HANDLE hConsole = GetStdHandle(STD_ERROR_HANDLE);

    if (hConsole == INVALID_HANDLE_VALUE)
        dwError = GetLastError();
    else
    {
        // fetch the processor brand string for the timing headline
        __cpuid(dwCPUID, 0x80000000);
        if (*dwCPUID >= 0x80000004)
        {
            __cpuid(dwCPUID, 0x80000002);
            __cpuid(dwCPUID + 4, 0x80000003);
            __cpuid(dwCPUID + 8, 0x80000004);
        }
        else
            __movsb(dwCPUID, "unidentified processor", sizeof("unidentified processor"));

        if (SetThreadIdealProcessor(hThread, 0) == -1)
            PrintConsole(hConsole,
                         L"SetThreadIdealProcessor() returned error %lu\n",
                         dwError = GetLastError());
        if (!SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST))
            PrintConsole(hConsole,
                         L"SetThreadPriority() returned error %lu\n",
                         dwError = GetLastError());

        PrintConsole(hConsole, L"\nTesting SHA-512 implementation...\n");

        SHA512_Init(&Context);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"\"\n"
                     L"\tcf83e1357eefb8bd f1542850d66d8007 d620e4050b5715dc 83f4a921d36ce9ce\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t47d0d13c5d85f2b0 ff8318d2877eec2f 63b931bd47417a81 a538327af927da3e\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, "abc", 3);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"abc\"\n"
                     L"\tddaf35a193617aba cc417349ae204131 12e6fa4e89a97ea2 0a9eeee64b55d39a\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t2192992a274fc1a8 36ba3c23a3feebbd 454d4423643ce80e 2a9ac94fa54ca49f\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                      sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                     L"\t8e959b75dae313da 8cf4f72814fc143f 8f7779c6eb9f7fa1 7299aeadb6889018\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t501d289e4900f7e4 331b99dec4b5433a c7d329eeb6dd2654 5e96e55b874be909\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        // message lengths around the padding boundary of 112 bytes
        __stosb(cbMillion, 0, 1000);
        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 111);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×111\n"
                     L"\t77ddd3a542e530fd 047b8977c657ba6c e72f1492e360b2b2 212cd264e75ec038\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t82e4ff0525517ab4 207d14c70c2259ba 88d4d335ee0e7e20 543d22102ab1788c\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 112);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×112\n"
                     L"\t2be2e788c8a8adea a9c89a7f78904cac ea6e39297d75e057 3a73c756234534d6\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t627ab4156b48a665 7b29ab8beb733340 40ad39ead81446bb 09c70704ec707952\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 113);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×113\n"
                     L"\t0e67910bcf0f9ccd e5464c63b9c850a1 2a759227d16b040d 98986d54253f9f34\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t322318e56b8feb86 c5fb2270ed87f312 52f7f68493ee7597 43909bd75e4bb544\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 122);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×122\n"
                     L"\t4f3f095d015be4a7 a7cc0b8c04da4aa0 9e74351e3a97651f 744c23716ebd9b3e\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t822e5077a01baa5c c0ed45b9249e88ab 343d4333539df21e d229da6f4a514e0f\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 1000);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×1000\n"
                     L"\tca3dff61bb23477a a6087b27508264a6 f9126ee3a004f53c b8db942ed345f2f2\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\td229b4b59c859220 a1cf1913f34248e3 803bab650e849a3d 9a709edc09ae4a76\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'A', 1000);
        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 1000);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"A…A\"\n"
                     L"\t329c52ac62d1fe73 1151f2b895a00475 445ef74f50b979c6 f7bb7cae349328c1\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\td4cb4f7261a0ab43 f936a24b000651d4 a824fcdd577f211a ef8f806b16afe8af\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'U', 1005);
        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 1005);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"U…U\"\n"
                     L"\t59f5e54fe299c6a8 764c6b199e44924a 37f59e2b56c3ebad 939b7289210dc8e4\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\tc21b9720165b0f4d 4374c90f1bf4fb4a 5ace17a116179801 5052893a48c3d161\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 'a', 1000000);
        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 1000000);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\"a…a\"\n"
                     L"\te718483d0ce76964 4e2e42c7bc15b463 8e1f98b13b204428 5632a803afa973eb\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\tde0ff244877ea60a 4cb0432ce577c31b eb009c5c2c49aa2e 4eadb217ad8cc09b\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        __stosb(cbMillion, 0, 1000000);
        SHA512_Init(&Context);
        SHA512_Update(&Context, cbMillion, 1000000);
        SHA512_Final(&Context, cbDigest);
        PrintConsole(hConsole,
                     L"\'\\0\'×1000000\n"
                     L"\tce044bc9fd43269d 5bbc946cbebc3bb7 11341115cc4abdf2 edbc3ff2c57ad4b1\n"
                     L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                     L"\t5deb699bda257fea 5aef9c6e55fcf4cf 9dc25a8c3ce25f2e fe90908379bff7ed\n",
                     Context.State[0], Context.State[1], Context.State[2], Context.State[3],
                     Context.State[4], Context.State[5], Context.State[6], Context.State[7]);

        PrintConsole(hConsole, L"\nTiming SHA-512 on %.48hs:\n", dwCPUID);
#ifdef CYCLES
        if (!QueryThreadCycleTime(hThread, qwThread))
            PrintConsole(hConsole,
                         L"QueryThreadCycleTime() returned error %lu\n",
                         dwError = GetLastError());
        else
        {
            do
                SHA512_Core(&Context);
            while (--dwThread);

            if (!QueryThreadCycleTime(hThread, qwThread + 1))
                PrintConsole(hConsole,
                             L"QueryThreadCycleTime() returned error %lu\n",
                             dwError = GetLastError());
            else
                // cycles for 1000000000 bytes, divided by 1000000000
                PrintConsole(hConsole,
                             L"%lu.%09lu clock cycles per byte\n",
                             __edivmodu(qwThread[1] - qwThread[0], 1000000000));
        }
#else
        // only the user time is of interest: the 3 other times go to a scratch
        // array, so neither call can overwrite the user time stored by the other
        if (!GetThreadTimes(hThread, ftIgnored, ftIgnored + 1, ftIgnored + 2, (LPFILETIME) qwThread))
            PrintConsole(hConsole,
                         L"GetThreadTimes() returned error %lu\n",
                         dwError = GetLastError());
        else
        {
            do
                SHA512_Core(&Context);
            while (--dwThread);

            if (!GetThreadTimes(hThread, ftIgnored, ftIgnored + 1, ftIgnored + 2, (LPFILETIME) qwThread + 1))
                PrintConsole(hConsole,
                             L"GetThreadTimes() returned error %lu\n",
                             dwError = GetLastError());
            else
                // user time in units of 100 ns for 1000000000 bytes:
                // dividing by 10000000 yields nano-seconds per byte
                PrintConsole(hConsole,
                             L"%lu.%07lu nano-seconds per byte\n",
                             __edivmodu(qwThread[1] - qwThread[0], 10000000));
        }
#endif // CYCLES
    }

    ExitProcess(dwError);
}
Save the
ANSI C
source presented above as sha-512.c
next to the object
file sha-512.obj
assembled before, then run the
following 4 command lines to build the 64-bit console application
sha-512.exe
and execute it:
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /Fosha-512.tmp sha-512.c sha-512.obj kernel32.lib user32.lib
.\sha-512.exe
For details and reference see the MSDN articles Compiler Options and Linker Options.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) C/C++ Optimizing Compiler Version 16.00.40219.01 for x64 Copyright (C) Microsoft Corporation. All rights reserved. sha-512.c Microsoft (R) Incremental Linker Version 10.00.40219.386 Copyright (C) Microsoft Corporation. All rights reserved. /ENTRY:mainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE /out:sha-512.exe sha-512.tmp sha-512.obj kernel32.lib user32.lib Testing SHA-512 implementation... "" cf83e1357eefb8bd f1542850d66d8007 d620e4050b5715dc 83f4a921d36ce9ce CF83E1357EEFB8BD F1542850D66D8007 D620E4050B5715DC 83F4A921D36CE9CE 47D0D13C5D85F2B0 FF8318D2877EEC2F 63B931BD47417A81 A538327AF927DA3E 47d0d13c5d85f2b0 ff8318d2877eec2f 63b931bd47417a81 a538327af927da3e "abc" ddaf35a193617aba cc417349ae204131 12e6fa4e89a97ea2 0a9eeee64b55d39a DDAF35A193617ABA CC417349AE204131 12E6FA4E89A97EA2 0A9EEEE64B55D39A 2192992A274FC1A8 36BA3C23A3FEEBBD 454D4423643CE80E 2A9AC94FA54CA49F 2192992a274fc1a8 36ba3c23a3feebbd 454d4423643ce80e 2a9ac94fa54ca49f "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" 8e959b75dae313da 8cf4f72814fc143f 8f7779c6eb9f7fa1 7299aeadb6889018 8E959B75DAE313DA 8CF4F72814FC143F 8F7779C6EB9F7FA1 7299AEADB6889018 501D289E4900F7E4 331B99DEC4B5433A C7D329EEB6DD2654 5E96E55B874BE909 501d289e4900f7e4 331b99dec4b5433a c7d329eeb6dd2654 5e96e55b874be909 '\0'×111 77ddd3a542e530fd 047b8977c657ba6c e72f1492e360b2b2 212cd264e75ec038 77DDD3A542E530FD 047B8977C657BA6C E72F1492E360B2B2 212CD264E75EC038 82E4FF0525517AB4 207D14C70C2259BA 88D4D335EE0E7E20 543D22102AB1788C 82e4ff0525517ab4 207d14c70c2259ba 88d4d335ee0e7e20 543d22102ab1788c '\0'×112 2be2e788c8a8adea a9c89a7f78904cac ea6e39297d75e057 3a73c756234534d6 2BE2E788C8A8ADEA A9C89A7F78904CAC EA6E39297D75E057 3A73C756234534D6 627AB4156B48A665 7B29AB8BEB733340 40AD39EAD81446BB 09C70704EC707952 627ab4156b48a665 7b29ab8beb733340 40ad39ead81446bb 09c70704ec707952 '\0'×113 0e67910bcf0f9ccd e5464c63b9c850a1 2a759227d16b040d 
98986d54253f9f34 0E67910BCF0F9CCD E5464C63B9C850A1 2A759227D16B040D 98986D54253F9F34 322318E56B8FEB86 C5FB2270ED87F312 52F7F68493EE7597 43909BD75E4BB544 322318e56b8feb86 c5fb2270ed87f312 52f7f68493ee7597 43909bd75e4bb544 '\0'×122 4f3f095d015be4a7 a7cc0b8c04da4aa0 9e74351e3a97651f 744c23716ebd9b3e 4F3F095D015BE4A7 A7CC0B8C04DA4AA0 9E74351E3A97651F 744C23716EBD9B3E 822E5077A01BAA5C C0ED45B9249E88AB 343D4333539DF21E D229DA6F4A514E0F 822e5077a01baa5c c0ed45b9249e88ab 343d4333539df21e d229da6f4a514e0f '\0'×1000 ca3dff61bb23477a a6087b27508264a6 f9126ee3a004f53c b8db942ed345f2f2 CA3DFF61BB23477A A6087B27508264A6 F9126EE3A004F53C B8DB942ED345F2F2 D229B4B59C859220 A1CF1913F34248E3 803BAB650E849A3D 9A709EDC09AE4A76 d229b4b59c859220 a1cf1913f34248e3 803bab650e849a3d 9a709edc09ae4a76 "A…A" 329c52ac62d1fe73 1151f2b895a00475 445ef74f50b979c6 f7bb7cae349328c1 329C52AC62D1FE73 1151F2B895A00475 445EF74F50B979C6 F7BB7CAE349328C1 D4CB4F7261A0AB43 F936A24B000651D4 A824FCDD577F211A EF8F806B16AFE8AF d4cb4f7261a0ab43 f936a24b000651d4 a824fcdd577f211a ef8f806b16afe8af "U…U" 59f5e54fe299c6a8 764c6b199e44924a 37f59e2b56c3ebad 939b7289210dc8e4 59F5E54FE299C6A8 764C6B199E44924A 37F59E2B56C3EBAD 939B7289210DC8E4 C21B9720165B0F4D 4374C90F1BF4FB4A 5ACE17A116179801 5052893A48C3D161 c21b9720165b0f4d 4374c90f1bf4fb4a 5ace17a116179801 5052893a48c3d161 "a…a" e718483d0ce76964 4e2e42c7bc15b463 8e1f98b13b204428 5632a803afa973eb E718483D0CE76964 4E2E42C7BC15B463 8E1F98B13B204428 5632A803AFA973EB DE0FF244877EA60A 4CB0432CE577C31B EB009C5C2C49AA2E 4EADB217AD8CC09B de0ff244877ea60a 4cb0432ce577c31b eb009c5c2c49aa2e 4eadb217ad8cc09b '\0'×1000000 ce044bc9fd43269d 5bbc946cbebc3bb7 11341115cc4abdf2 edbc3ff2c57ad4b1 CE044BC9FD43269D 5BBC946CBEBC3BB7 11341115CC4ABDF2 EDBC3FF2C57AD4B1 5DEB699BDA257FEA 5AEF9C6E55FCF4CF 9DC25A8C3CE25F2E FE90908379BFF7ED 5deb699bda257fea 5aef9c6e55fcf4cf 9dc25a8c3ce25f2e fe90908379bff7ed Timing SHA-512 on Intel(R) Core(TM)2 Duo CPU P8700 @ 2.53GHz: 10.754902696 clock cycles per 
byte
In other units: 4.564422 seconds per GiB, 235.2 MB per second, or 2.81 instructions per clock cycle.
On a newer processor, running at 3.4 GHz:
[…] Timing SHA-512 on AMD Ryzen 7 5700X 8-Core Processor : 5.300458322 clock cycles per byte
Also in other units: 1.451893 seconds per GiB, 641.5 MB per second, or 5.71 instructions per clock cycle.
With the white paper New Instructions Supporting the Secure Hash Algorithm on Intel® Architecture Processors, published July 17, 2013, Intel® announced the SHA Extensions alias SHA New Instructions,
SHA1MSG1
,
SHA1MSG2
,
SHA1NEXTE
plus
SHA1RNDS4
for calculation of
SHA-1, and
SHA256MSG1
,
SHA256MSG2
plus
SHA256RNDS2
for calculation of
SHA-256. They were
first supported in the Goldmont micro-architecture,
introduced August 30, 2016, with the Apollo Lake and
Denverton processor families.
Intel® SHA Extensions Implementations
AMD® Ryzen™ and EPYC™ processors, introduced on February 2, 2017 and June 20, 2017 respectively, support them too.
The yet to come
VSHA512MSG1
,
VSHA512MSG2
and
VSHA512RNDS2
instructions for calculation of
SHA-512 are
documented since revision 049 of the
Intel® Architecture Instruction Set Extensions and Future Features Programming Reference,
published in June 2023.
SHA1MSG1
,
SHA1MSG2
,
SHA1NEXTE
and
SHA1RNDS4
InstructionsSHA1MSG1–Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 C9 /r
SHA1MSG1 xmm1, xmm2/m128RM V/V SHA Performs an intermediate calculation for the next four SHA1 message dwords using previous message dwords from xmm1 and xmm2/m128, storing the result in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RM ModRM:reg (r, w) ModRM:r/m (r) N/A Description
The SHA1MSG1 instruction is one of two SHA1 message scheduling instructions. The instruction performs an intermediate calculation for the next four SHA1 message dwords.
Operation
SHA1MSG1
W0 := SRC1[127:96] ;
W1 := SRC1[95:64] ;
W2 := SRC1[63: 32] ;
W3 := SRC1[31: 0] ;
W4 := SRC2[127:96] ;
W5 := SRC2[95:64] ;
DEST[127:96] := W2 XOR W0;
DEST[95:64] := W3 XOR W1;
DEST[63:32] := W4 XOR W2;
DEST[31:0] := W5 XOR W3;
SHA1MSG2–Perform a Final Calculation for the Next Four SHA1 Message Dwords
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 CA /r
SHA1MSG2 xmm1, xmm2/m128RM V/V SHA Performs the final calculation for the next four SHA1 message dwords using intermediate results from xmm1 and the previous message dwords from xmm2/m128, storing the result in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RM ModRM:reg (r, w) ModRM:r/m (r) N/A Description
The SHA1MSG2 instruction is one of two SHA1 message scheduling instructions. The instruction performs the final calculation to derive the next four SHA1 message dwords.
Operation
SHA1MSG2
W13 := SRC2[95:64] ;
W14 := SRC2[63: 32] ;
W15 := SRC2[31: 0] ;
W16 := (SRC1[127:96] XOR W13 ) ROL 1;
W17 := (SRC1[95:64] XOR W14) ROL 1;
W18 := (SRC1[63: 32] XOR W15) ROL 1;
W19 := (SRC1[31: 0] XOR W16) ROL 1;
DEST[127:96] := W16;
DEST[95:64] := W17;
DEST[63:32] := W18;
DEST[31:0] := W19;
SHA1NEXTE–Calculate SHA1 State Variable E After Four Rounds
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 C8 /r
SHA1NEXTE xmm1, xmm2/m128RM V/V SHA Calculates SHA1 state variable E after four rounds of operation from the current SHA1 state variable A in xmm1. The calculated value of the SHA1 state variable E is added to the scheduled dwords in xmm2/m128, and stored with some of the scheduled dwords in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RM ModRM:reg (r, w) ModRM:r/m (r) N/A Description
The SHA1NEXTE calculates the SHA1 state variable E after four rounds of operation from the current SHA1 state variable A in the destination operand. The calculated value of the SHA1 state variable E is added to the source operand, which contains the scheduled dwords.
Operation
SHA1NEXTE
TMP := (SRC1[127:96] ROL 30);
DEST[127:96] := SRC2[127:96] + TMP;
DEST[95:64] := SRC2[95:64];
DEST[63:32] := SRC2[63:32];
DEST[31:0] := SRC2[31:0];
SHA1RNDS4–Perform Four Rounds of SHA1 Operation
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 3A CC /r ib
SHA1RNDS4 xmm1, xmm2/m128, imm8RMI V/V SHA Performs four rounds of SHA1 operation operating on SHA1 state (A,B,C,D) from xmm1, with a pre-computed sum of the next 4 round message dwords and state variable E from xmm2/m128. The immediate byte controls logic functions and round constants.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RMI ModRM:reg (r, w) ModRM:r/m (r) imm8 Description
The SHA1RNDS4 instruction performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from the first operand (which is a source operand and the destination operand) and some pre-computed sum of the next 4 round message dwords, and state variable E from the second operand (a source operand). The updated SHA1 state (A,B,C,D) after four rounds of processing is stored in the destination operand.
Operation
SHA1RNDS4
The function f() and Constant K are dependent on the value of the immediate.
IF ( imm8[1:0] = 0 )
THEN f() := f0(), K := K0;
ELSE IF ( imm8[1:0] = 1 )
THEN f() := f1(), K := K1;
ELSE IF ( imm8[1:0] = 2 )
THEN f() := f2(), K := K2;
ELSE IF ( imm8[1:0] = 3 )
THEN f() := f3(), K := K3;
FI;
A := SRC1[127:96];
B := SRC1[95:64];
C := SRC1[63:32];
D := SRC1[31:0];
W0E := SRC2[127:96];
W1 := SRC2[95:64];
W2 := SRC2[63:32];
W3 := SRC2[31:0];
Round i = 0 operation:
A_1 := f (B, C, D) + (A ROL 5) +W0E +K;
B_1 := A;
C_1 := B ROL 30;
D_1 := C;
E_1 := D;
FOR i = 1 to 3
A_(i +1) := f (B_i, C_i, D_i) + (A_i ROL 5) +Wi+ E_i +K;
B_(i +1) := A_i;
C_(i +1) := B_i ROL 30;
D_(i +1) := C_i;
E_(i +1) := D_i;
ENDFOR
DEST[127:96] := A_4;
DEST[95:64] := B_4;
DEST[63:32] := C_4;
DEST[31:0] := D_4;
; Copyright © 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; The procedures below are written against these short aliases, which
; expand to the public names of the SHA-1 routines.
core textequ <SHA1_Core>
final textequ <SHA1_Final>
init textequ <SHA1_Init>
update textequ <SHA1_Update>
; Layout of the hashing context shared by all four routines:
; offset 0 = state, 16 = spare, 24 = count, 32 = block.
context struct 16 ; SHA1_CTX structure
state oword ? ; {a, b, c, d}; e lives in the first dword of 'spare'
spare dword 2 dup (?) ; {e, dword that no routine below writes}
count qword ? ; number of message bytes fed in so far
block oword 4 dup (?) ; 64-byte buffer holding the current message block
context ends
.const
; pshufb mask used by the core routine to reverse all 16 bytes of a register
reverse oword 000102030405060708090A0B0C0D0E0Fh
.code
; SHA1_Core: runs the 80 rounds of SHA-1 over the 64-byte block stored in
; context.block and adds the result to the five state dwords in place.
; rcx = address of the context structure (not modified).
; Only xmm0..xmm6 are used; xmm6 and the incoming e are parked at [rsp+8] and
; [rsp+24], i.e. in the 32 bytes above the return address which the callers
; in this file reserve as "home space".
; NOTE(review): the movdqa stores assume rsp+8 is 16-byte aligned at entry.
; xmm5 and xmm6 alternate as the "e" operand: each group of four rounds
; copies the current {a, b, c, d} into the one that is free, and sha1nexte
; turns that copy into e + message dwords for the group after it.
; The immediate of sha1rnds4 (0..3) selects the round function and constant
; for rounds 0-19, 20-39, 40-59 and 60-79 respectively.
core proc public ; void SHA1_Core(SHA1_CTX *context)
; xmm6 is saved here and restored before returning
movdqa [rsp+8], xmm6
movdqu xmm0, context.block[rcx]
movdqu xmm1, context.block[rcx+16]
movdqu xmm2, context.block[rcx+32]
movdqu xmm3, context.block[rcx+48]
movdqu xmm4, context.state[rcx]; xmm4 = {a, b, c, d}
movdqu xmm5, context.state[rcx+16]
; xmm5 = {e, *, *, *}
movdqa xmm6, reverse
pslldq xmm5, 12 ; xmm5 = {0, 0, 0, e}
pshufd xmm4, xmm4, 27 ; xmm4 = {d, c, b, a}
; reverse the byte order of the 16 message dwords
pshufb xmm3, xmm6 ; xmm3 = {W15, W14, W13, W12}
pshufb xmm2, xmm6 ; xmm2 = {W11, W10, W9, W8}
pshufb xmm1, xmm6 ; xmm1 = {W7, W6, W5, W4}
pshufb xmm0, xmm6 ; xmm0 = {W3, W2, W1, W0}
; keep the incoming e for the final addition
movdqa [rsp+24], xmm5
; round 0 to 3
paddd xmm5, xmm0
movdqa xmm6, xmm4
sha1rnds4 xmm4, xmm5, 0
; round 4 to 7
sha1nexte xmm6, xmm1
movdqa xmm5, xmm4
sha1rnds4 xmm4, xmm6, 0
sha1msg1 xmm0, xmm1
; round 8 to 11
sha1nexte xmm5, xmm2
movdqa xmm6, xmm4
sha1rnds4 xmm4, xmm5, 0
sha1msg1 xmm1, xmm2
pxor xmm0, xmm2
; round 12 to 15
sha1nexte xmm6, xmm3
movdqa xmm5, xmm4
sha1msg2 xmm0, xmm3
sha1rnds4 xmm4, xmm6, 0
sha1msg1 xmm2, xmm3
pxor xmm1, xmm3
; round 16 to 19
sha1nexte xmm5, xmm0
movdqa xmm6, xmm4
sha1msg2 xmm1, xmm0
sha1rnds4 xmm4, xmm5, 0
sha1msg1 xmm3, xmm0
pxor xmm2, xmm0
; round 20 to 23
sha1nexte xmm6, xmm1
movdqa xmm5, xmm4
sha1msg2 xmm2, xmm1
sha1rnds4 xmm4, xmm6, 1
sha1msg1 xmm0, xmm1
pxor xmm3, xmm1
; round 24 to 27
sha1nexte xmm5, xmm2
movdqa xmm6, xmm4
sha1msg2 xmm3, xmm2
sha1rnds4 xmm4, xmm5, 1
sha1msg1 xmm1, xmm2
pxor xmm0, xmm2
; round 28 to 31
sha1nexte xmm6, xmm3
movdqa xmm5, xmm4
sha1msg2 xmm0, xmm3
sha1rnds4 xmm4, xmm6, 1
sha1msg1 xmm2, xmm3
pxor xmm1, xmm3
; round 32 to 35
sha1nexte xmm5, xmm0
movdqa xmm6, xmm4
sha1msg2 xmm1, xmm0
sha1rnds4 xmm4, xmm5, 1
sha1msg1 xmm3, xmm0
pxor xmm2, xmm0
; round 36 to 39
sha1nexte xmm6, xmm1
movdqa xmm5, xmm4
sha1msg2 xmm2, xmm1
sha1rnds4 xmm4, xmm6, 1
sha1msg1 xmm0, xmm1
pxor xmm3, xmm1
; round 40 to 43
sha1nexte xmm5, xmm2
movdqa xmm6, xmm4
sha1msg2 xmm3, xmm2
sha1rnds4 xmm4, xmm5, 2
sha1msg1 xmm1, xmm2
pxor xmm0, xmm2
; round 44 to 47
sha1nexte xmm6, xmm3
movdqa xmm5, xmm4
sha1msg2 xmm0, xmm3
sha1rnds4 xmm4, xmm6, 2
sha1msg1 xmm2, xmm3
pxor xmm1, xmm3
; round 48 to 51
sha1nexte xmm5, xmm0
movdqa xmm6, xmm4
sha1msg2 xmm1, xmm0
sha1rnds4 xmm4, xmm5, 2
sha1msg1 xmm3, xmm0
pxor xmm2, xmm0
; round 52 to 55
sha1nexte xmm6, xmm1
movdqa xmm5, xmm4
sha1msg2 xmm2, xmm1
sha1rnds4 xmm4, xmm6, 2
sha1msg1 xmm0, xmm1
pxor xmm3, xmm1
; round 56 to 59
sha1nexte xmm5, xmm2
movdqa xmm6, xmm4
sha1msg2 xmm3, xmm2
sha1rnds4 xmm4, xmm5, 2
sha1msg1 xmm1, xmm2
pxor xmm0, xmm2
; round 60 to 63
sha1nexte xmm6, xmm3
movdqa xmm5, xmm4
sha1msg2 xmm0, xmm3
sha1rnds4 xmm4, xmm6, 3
sha1msg1 xmm2, xmm3
pxor xmm1, xmm3
; round 64 to 67
sha1nexte xmm5, xmm0
movdqa xmm6, xmm4
sha1msg2 xmm1, xmm0
sha1rnds4 xmm4, xmm5, 3
sha1msg1 xmm3, xmm0
pxor xmm2, xmm0
; round 68 to 71
; (from here on the message schedule winds down: no further sha1msg1)
sha1nexte xmm6, xmm1
movdqa xmm5, xmm4
sha1msg2 xmm2, xmm1
sha1rnds4 xmm4, xmm6, 3
pxor xmm3, xmm1
; round 72 to 75
sha1nexte xmm5, xmm2
movdqa xmm6, xmm4
sha1msg2 xmm3, xmm2
sha1rnds4 xmm4, xmm5, 3
; round 76 to 79
sha1nexte xmm6, xmm3
movdqa xmm5, xmm4
sha1rnds4 xmm4, xmm6, 3
; restore xmm6, then add the block result to the previous state
movdqa xmm6, [rsp+8]
movdqu xmm0, context.state[rcx]; xmm0 = {a, b, c, d}
movdqa xmm1, [rsp+24] ; xmm1 = {0, 0, 0, e}
sha1nexte xmm5, xmm1 ; xmm5 = {*, *, *, e"}
pshufd xmm4, xmm4, 27 ; xmm4 = {a', b', c', d'}
paddd xmm4, xmm0 ; xmm4 = {a', b', c', d'}
; + {a, b, c, d}
; = {a", b", c", d"}
movdqu context.state[rcx], xmm4
; the first dword of 'spare' is the fifth state dword e
pextrd context.spare[rcx], xmm5, 3
; state = {a", b", c", d", e"}
ret
core endp
; SHA1_Final: pads the buffered message, processes the last block(s) and
; writes the 20-byte digest, most significant byte of each state dword first.
; rcx = address of the context structure, rdx = address of the digest buffer.
; The padding is a single 0x80 byte, zero bytes up to offset 56 of the block,
; and the message length in bits as big-endian qword in the last 8 bytes.
; If fewer than 8 bytes remain free after the 0x80 byte, the current block is
; processed first and the length goes into an additional, otherwise zero block.
; rdi and rsi are saved and restored; the string instructions rely on the
; direction flag being clear.
final proc public ; void SHA1_Final(SHA1_CTX *context,
; unsigned char digest[20])
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push rdi
lea rdi, context.block[r9+rcx]
mov r8, rdi ; r8 = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; rax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 63 - index
rep stosb ; block[index + 1, 63] = 0
sub rdi, sizeof context.count
; rdi = address of last qword in block
cmp r8, rdi
jb short pad_count ; space for count available in block?
; index < 56?
; no room for the length: process this block, then build an all-zero one
mov rcx, r9 ; rcx = address of context structure
push r9
push r8
push rdx
push rax
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rax
pop rdx
pop r8
pop r9
pad_block:
mov rcx, rdi ; rcx = address of last qword in block
lea rdi, context.block[r9] ; rdi = address of block
;; xor eax, eax ; rax = 0
sub rcx, rdi ; rcx = number of bytes before last qword
; = 56
rep stosb ; block[0, 55] = 0,
; rdi = address of last qword in block
pad_count:
mov rax, context.count[r9] ; rax = count
shl rax, 3 ; rax = count * 8
; = number of message bits
bswap rax ; rax = htonll(number of message bits)
stosq ; block[56, 63] = number of message bits
mov rcx, r9 ; rcx = address of context structure
push r9
push rdx
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rdi ; rdi = address of digest
mov r9, rsi ; keep the caller's rsi in r9 during the copy
pop rsi ; rsi = address of state
; The state is 5 dwords {a, b, c, d, e}. 'lengthof context.state' must not
; be used here: the field is declared as a single oword, so it yields 1
; and only the first 4 digest bytes would be written.
mov ecx, 5 ; ecx = number of state dwords
digest:
lodsd
ifndef ALIGNED
; byte-wise stores, for a digest buffer of arbitrary alignment
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
rol eax, 8
stosb
else ; ALIGNED
bswap eax
stosd
endif ; ALIGNED
dec ecx
jnz short digest
mov rsi, r9 ; restore the caller's rsi
pop rdi
ret
final endp
; SHA1_Init: loads the five initial chaining values H0..H4 into the state
; and clears the byte count. rcx = address of the context structure.
; 'count' lies at offset 24 of the structure, so context.count[rcx-24] and
; context.count[rcx-16] address offsets 0 and 8, i.e. the two qword halves
; of the 'state' oword; context.spare[rcx] addresses the fifth state dword.
init proc public ; void SHA1_Init(SHA1_CTX *context)
mov rax, 0EFCDAB8967452301h ; rax = ntohll(0x0123456789ABCDEF)
; = H1 << 32 | H0
mov context.count[rcx-24], rax
; state[0] = H0,
; state[1] = H1
mov rax, 01032547698BADCFEh ; rax = ntohll(0xFEDCBA9876543210)
; = H3 << 32 | H2
mov context.count[rcx-16], rax
; state[2] = H2,
; state[3] = H3
mov eax, 0C3D2E1F0h ; eax = H4
mov context.spare[rcx], eax ; state[4] = H4
xor eax, eax
mov context.count[rcx], rax ; count = 0
ret
init endp
; SHA1_Update: appends 'size' bytes of 'data' to the message. Bytes are
; collected in context.block; every time the block holds 64 bytes it is
; processed by the core routine. A partial block stays buffered.
; rcx = address of the context structure, rdx = address of the data,
; r8d = number of bytes. rsi and rdi are saved and restored; the string
; instructions rely on the direction flag being clear.
update proc public ; void SHA1_Update(SHA1_CTX *context,
; void const *data,
; unsigned int size)
; 'size' is a 32-bit argument: the upper half of r8 is not defined by the
; calling convention, so zero-extend it before using the full register
mov r8d, r8d ; r8 = number of bytes in data
test r8, r8
jz short none ; no data?
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[r9], r8 ; count += number of bytes in data
push rsi
mov rsi, rdx ; rsi = address of data
push rdi
data:
lea rdi, context.block[r9+rcx]
; rdi = address of first free byte in block
xor ecx, sizeof context.block - 1
inc ecx ; rcx = number of free bytes in block
sub r8, rcx ; r8 = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
rep movsb ; rsi = address of remaining data
mov rdi, r9 ; keep the context address in rdi across the call
mov rcx, r9 ; rcx = address of context structure
push r8
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop r8
mov r9, rdi
xor ecx, ecx ; rcx = 0 = index of first free byte in block
test r8, r8
jnz short data ; more data?
pop rdi
pop rsi
none:
ret
last:
; the subtraction above went negative: undo it to get the byte count
add rcx, r8 ; rcx = number of bytes in data
rep movsb
pop rdi
pop rsi
ret
update endp
end
Note: the function SHA1_Core()
has
132 instructions in 572 bytes plus 16 bytes read-only data.
Microsoft Macro Assembler Reference
Save the AMD64 assembler source presented above as
sha-1.asm
in an arbitrary, preferably empty
directory, then execute the following 2 command lines to generate
the 64-bit object file sha-1.obj
:
SET ML=/c /W3 /X
ML64.EXE /DALIGNED sha-1.asm
For details and reference see the MSDN article ML and ML64 Command-Line Reference.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) Macro Assembler Version (x64) 14.16.27023.1
Copyright (C) Microsoft Corporation. All rights reserved.

Assembling: sha-1.asm
Save the following ANSI C header file as
sha-1.h
to #include
it in
your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Hashing context of the SHA-1 routines.
// NOTE: should be 16-byte aligned!
struct _SHA1_CTX {
    unsigned int state[5];  // chaining value {a, b, c, d, e}
    unsigned int spare;     // filler, places count at offset 24
    unsigned int count[2];  // number of message bytes, low dword first
    unsigned int block[16]; // buffer for one 64-byte message block
};
typedef struct _SHA1_CTX SHA1_CTX;

// Process the 64-byte block buffered in the context.
void SHA1_Core(SHA1_CTX *context);
// Pad the message, process the last block(s) and write the 20-byte digest.
void SHA1_Final(SHA1_CTX *context, unsigned char digest[20]);
// Set the initial state and clear the byte count.
void SHA1_Init(SHA1_CTX *context);
// Append size bytes of data to the message.
void SHA1_Update(SHA1_CTX *context, void const *data, unsigned int size);
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// Hashing context passed to the SHA1_*() routines.
// The member order and sizes mirror the 'context' structure of the assembler
// module: State at offset 0, Count at offset 24, Block at offset 32.
__declspec(align(16))
typedef struct _SHA1_CTX
{
DWORD State[5]; // chaining value; initially {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0}
DWORD Spare; // filler between State and Count
DWORD64 Count; // number of message bytes processed; initially 0
DWORD Block[16]; // buffer for one 64-byte message block
} SHA1_CTX;
// Routines provided by the separately assembled object file sha-1.obj.
// Processes the 64-byte block held in Context->Block and updates Context->State.
VOID CDECL SHA1_Core(SHA1_CTX *Context);
// Pads the message, processes the final block(s) and stores the digest.
VOID CDECL SHA1_Final(SHA1_CTX *Context, BYTE Digest[20]);
// Loads the initial state and resets the byte count.
VOID CDECL SHA1_Init(SHA1_CTX *Context);
// Appends Size bytes at Data to the message.
VOID CDECL SHA1_Update(SHA1_CTX *Context, LPCVOID Data, DWORD Size);
// __edivmodu(N, D): quotient and remainder of the unsigned division N / D,
// each truncated to a DWORD.
#ifndef _M_IX86
// This variant expands to TWO comma-separated expressions, so it can only be
// used inside an argument list; below it supplies the two %lu arguments of
// the timing messages. Both N and D are evaluated twice.
#define __edivmodu(N, D) (DWORD) ((N) / (D)), (DWORD) ((N) % (D))
#else
// 32-bit x86 variant: a single DIV instruction divides the 64-bit dividend
// in EDX:EAX by the 32-bit divisor.
// NOTE(review): there is no return statement; the code appears to rely on
// the compiler returning the 8-byte structure in EDX:EAX, where DIV leaves
// quotient and remainder -- confirm for the compiler in use.
// NOTE(review): DIV faults if the quotient does not fit in 32 bits.
__forceinline // companion for __emulu()
struct
{
DWORD ulQuotient, ulRemainder;
} CDECL __edivmodu(DWORD64 ullDividend, DWORD ulDivisor)
{
__asm mov eax, dword ptr ullDividend
__asm mov edx, dword ptr ullDividend+4
__asm div ulDivisor
}
#endif // _M_IX86
// Formats a message with wvsprintf() into a local buffer of 1024 wide
// characters and writes it to the console.
// hConsole: console handle to write to
// lpFormat: wvsprintf() format string, followed by its arguments
// Returns TRUE only if the formatted text is not empty and WriteConsole()
// reports having written all of its characters; FALSE otherwise.
__declspec(safebuffers)
BOOL CDECL PrintConsole(HANDLE hConsole, [SA_FormatString(Style="printf")] LPCWSTR lpFormat, ...)
{
    WCHAR   szText[1024];
    DWORD   dwLength;
    DWORD   dwWritten;
    va_list vaArguments;

    va_start(vaArguments, lpFormat);
    dwLength = wvsprintf(szText, lpFormat, vaArguments);
    va_end(vaArguments);

    if (dwLength != 0
     && WriteConsole(hConsole, szText, dwLength, &dwWritten, NULL))
        return dwWritten == dwLength;

    return FALSE;
}
// Entry point of the SHA-1 test and benchmark program.
//
// 1. Reads the processor brand string into dwCPUID[4..15] (48 bytes), or the
//    text "unidentified processor" when the extended CPUID leaves are missing.
// 2. Checks CPUID leaf 7, EBX bit 29 and stops with a message when the SHA
//    instructions used by SHA1_Core() are not available.
// 3. Hashes a series of test messages; each message prints the expected
//    digest followed by the computed state for visual comparison.
// 4. Runs SHA1_Core() 1000000000 / 64 times, i.e. over 10^9 bytes, and prints
//    the cost per byte: clock cycles when CYCLES is defined, else user time.
//
// Never returns: the process exits with the last Win32 error code recorded,
// or ERROR_SUCCESS.
__declspec(noreturn)
VOID CDECL wmainCRTStartup(VOID)
{
    SHA1_CTX Context;
    BYTE     cbDigest[20], cbMillion[1000000];
    DWORD    dwDigest = 16777216;
    DWORD    dwCPUID[16];
    DWORD    dwError = ERROR_SUCCESS;
    DWORD    dwThread = 1000000000 / 64;
    DWORD64  qwThread[2];
#ifndef CYCLES
    FILETIME ftUnused;  // receives the creation, exit and kernel times
#endif
    HANDLE   hThread = GetCurrentThread();
    HANDLE   hConsole = GetStdHandle(STD_ERROR_HANDLE);

    if (hConsole == INVALID_HANDLE_VALUE)
        dwError = GetLastError();
    else
    {
        __cpuid(dwCPUID, 0x80000000);

        if (*dwCPUID >= 0x80000004)
        {
            __cpuid(dwCPUID + 4, 0x80000002);
            __cpuid(dwCPUID + 8, 0x80000003);
            __cpuid(dwCPUID + 12, 0x80000004);
        }
        else
            // the name is printed from dwCPUID + 4, and dwCPUID[0..3] is
            // overwritten by the next query, so store the fallback text there
            __movsb((LPBYTE) (dwCPUID + 4), (BYTE const *) "unidentified processor", sizeof("unidentified processor"));

        __cpuid(dwCPUID, 7);

        // parentheses are required: == binds tighter than &
        if ((dwCPUID[1] & (1UL << 29)) == 0)
            PrintConsole(hConsole, L"SHA-NI instructions not supported on %.48hs!\n", dwCPUID + 4);
        else
        {
            if (SetThreadIdealProcessor(hThread, 0) == -1)
                PrintConsole(hConsole,
                             L"SetThreadIdealProcessor() returned error %lu\n",
                             dwError = GetLastError());

            if (!SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST))
                PrintConsole(hConsole,
                             L"SetThreadPriority() returned error %lu\n",
                             dwError = GetLastError());

            PrintConsole(hConsole, L"\nTesting SHA-1 implementation...\n");

            // empty message
            SHA1_Init(&Context);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"\"\n"
                         L"\tda39a3ee 5e6b4b0d 3255bfef 95601890 afd80709\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            SHA1_Init(&Context);
            SHA1_Update(&Context, "abc", 3);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abc\"\n"
                         L"\ta9993e36 4706816a ba3e2571 7850c26c 9cd0d89d\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            SHA1_Init(&Context);
            SHA1_Update(&Context, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
                        sizeof("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") - 1);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq\"\n"
                         L"\t84983e44 1c3bd26e baae4aa1 f95129e5 e54670f1\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            SHA1_Init(&Context);
            SHA1_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                        sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                         L"\ta49b2446 a02c645b f419f995 b6709125 3a04a259\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            // same message as before, fed in two pieces
            SHA1_Init(&Context);
            SHA1_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghij",
                        sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghij") - 1);
            SHA1_Update(&Context, "klmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                        sizeof("klmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghij\".\"klmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                         L"\ta49b2446 a02c645b f419f995 b6709125 3a04a259\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            // one million times 'a'
            __stosb(cbMillion, 'a', sizeof(cbMillion));
            SHA1_Init(&Context);
            SHA1_Update(&Context, cbMillion, sizeof(cbMillion));
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"a…a\"\n"
                         L"\t34aa973c d4c4daa4 f61eeb2b dbad2731 6534016f\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            // 64-byte string repeated 16777216 times
            SHA1_Init(&Context);
            do
                SHA1_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno",
                            sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno") - 1);
            while (--dwDigest);
            SHA1_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno\"×16777216\n"
                         L"\t7789f0c9 ef7bfc40 d9331114 3dfbe69e 2017f592\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[0], Context.State[1], Context.State[2], Context.State[3], Context.State[4]);

            PrintConsole(hConsole, L"\nTiming SHA-1 on %.48hs:\n", dwCPUID + 4);
#ifdef CYCLES
            if (!QueryThreadCycleTime(hThread, qwThread))
                PrintConsole(hConsole,
                             L"QueryThreadCycleTime() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA1_Core(&Context);
                while (--dwThread);

                if (!QueryThreadCycleTime(hThread, qwThread + 1))
                    PrintConsole(hConsole,
                                 L"QueryThreadCycleTime() returned error %lu\n",
                                 dwError = GetLastError());
                else
                    // cycles / 10^9 bytes, printed as integer and 9 decimals
                    PrintConsole(hConsole,
                                 L"%lu.%09lu clock cycles per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 1000000000));
            }
#else
            // Only the user time is of interest; the three other results go
            // to a scratch variable so that the second call cannot overwrite
            // the start time kept in qwThread[0].
            if (!GetThreadTimes(hThread, &ftUnused, &ftUnused, &ftUnused, (LPFILETIME) qwThread))
                PrintConsole(hConsole,
                             L"GetThreadTimes() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA1_Core(&Context);
                while (--dwThread);

                if (!GetThreadTimes(hThread, &ftUnused, &ftUnused, &ftUnused, (LPFILETIME) qwThread + 1))
                    PrintConsole(hConsole,
                                 L"GetThreadTimes() returned error %lu\n",
                                 dwError = GetLastError());
                else
                    // 100 ns units / 10^9 bytes = units / 10^7 ns per byte
                    PrintConsole(hConsole,
                                 L"%lu.%07lu nano-seconds per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 10000000));
            }
#endif // CYCLES
        }
    }

    ExitProcess(dwError);
}
Save the
ANSI C
source presented above as sha-1.c
next to the object
file sha-1.obj
assembled before, then run the following
4 command lines to build the console application
sha-1.exe
and execute it:
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /DCYCLES /Fosha-1.tmp sha-1.c sha-1.obj kernel32.lib user32.lib
.\sha-1.exe
For details and reference see the MSDN articles Compiler Options and Linker Options.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) C/C++ Optimizing Compiler Version 16.00.40219.01 for x64 Copyright (C) Microsoft Corporation. All rights reserved. sha-1.c Microsoft (R) Incremental Linker Version 10.00.40219.386 Copyright (C) Microsoft Corporation. All rights reserved. /ENTRY:mainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE /out:sha-1.exe sha-1.tmp sha-1.obj kernel32.lib user32.lib Testing SHA-1 implementation... "" da39a3ee 5e6b4b0d 3255bfef 95601890 afd80709 DA39A3EE 5E6B4B0D 3255BFEF 95601890 AFD80709 "abc" a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 84983e44 1c3bd26e baae4aa1 f95129e5 e54670f1 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" a49b2446 a02c645b f419f995 b6709125 3a04a259 A49B2446 A02C645B F419F995 B6709125 3A04A259 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghij"."klmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" a49b2446 a02c645b f419f995 b6709125 3a04a259 A49B2446 A02C645B F419F995 B6709125 3A04A259 "a…a" 34aa973c d4c4daa4 f61eeb2b dbad2731 6534016f 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno"×16777216 7789f0c9 ef7bfc40 d9331114 3dfbe69e 2017f592 7789F0C9 EF7BFC40 D9331114 3DFBE69E 2017F592 Timing SHA-1 on AMD Ryzen 7 5700X 8-Core Processor : 1.647124968 clock cycles per byteAt the nominal clock frequency of 3.4 GHz this should be a little more than 2 GB per second.
Build the console application sha-1.exe
a second time,
now without the preprocessor macro CYCLES
defined, and
execute it:
CL.EXE /Fosha-1.tmp sha-1.c sha-1.obj .\sha-1.exe
[…] Timing SHA-1 on AMD Ryzen 7 5700X 8-Core Processor : 1.0156250 nano-seconds per byte
OOPS: measuring execution time instead of clock cycles yields but 984.6 MB per second, and the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 1.622 GHz instead of the nominal 3.4 GHz, i.e. Ryzen™ processors apparently lower their clock frequency while executing SHA New Instructions!
Repetition of the measurement on another processor confirms this undocumented misbehaviour:
[…] Timing SHA-1 on AMD Ryzen 7 2700X Eight-Core Processor : 1.919058465 clock cycles per byte
At the nominal clock frequency of 3.2 GHz this should be 1667.5 MB per second.
[…] Timing SHA-1 on AMD Ryzen 7 2700X Eight-Core Processor : 1.1875000 nano-seconds per byte
OUCH: measuring execution time instead of clock cycles yields but 842.1 MB per second, and the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 1.616 GHz instead of the nominal 3.2 GHz, i.e. Ryzen™ processors apparently lower their clock frequency while executing SHA New Instructions!
SHA256MSG1
,
SHA256MSG2
and
SHA256RNDS2
InstructionsSHA256MSG1–Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 CC /r
SHA256MSG1 xmm1, xmm2/m128RM V/V SHA Performs an intermediate calculation for the next four SHA256 message dwords using previous message dwords from xmm1 and xmm2/m128, storing the result in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RM ModRM:reg (r, w) ModRM:r/m (r) N/A Description
The SHA256MSG1 instruction is one of two SHA256 message scheduling instructions. The instruction performs an intermediate calculation for the next four SHA256 message dwords.
Operation
SHA256MSG1
W4 := SRC2[31: 0] ;
W3 := SRC1[127:96] ;
W2 := SRC1[95:64] ;
W1 := SRC1[63: 32] ;
W0 := SRC1[31: 0] ;
DEST[127:96] := W3 + σ0( W4);
DEST[95:64] := W2 + σ0( W3);
DEST[63:32] := W1 + σ0( W2);
DEST[31:0] := W0 + σ0( W1);
SHA256MSG2–Perform a Final Calculation for the Next Four SHA256 Message Dwords
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 CD /r
SHA256MSG2 xmm1, xmm2/m128RM V/V SHA Performs the final calculation for the next four SHA256 message dwords using previous message dwords from xmm1 and xmm2/m128, storing the result in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RM ModRM:reg (r, w) ModRM:r/m (r) N/A Description
The SHA256MSG2 instruction is one of two SHA2 message scheduling instructions. The instruction performs the final calculation for the next four SHA256 message dwords.
Operation
SHA256MSG2
W14 := SRC2[95:64] ;
W15 := SRC2[127:96] ;
W16 := SRC1[31: 0] + σ1( W14) ;
W17 := SRC1[63: 32] + σ1( W15) ;
W18 := SRC1[95: 64] + σ1( W16) ;
W19 := SRC1[127: 96] + σ1( W17) ;
DEST[127:96] := W19 ;
DEST[95:64] := W18 ;
DEST[63:32] := W17 ;
DEST[31:0] := W16;
SHA256RNDS2–Perform Two Rounds of SHA256 Operation???
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description NP 0F 38 CB /r
SHA256RNDS2 xmm1, xmm2/m128, <XMM0>RMI V/V SHA Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from xmm1, an initial SHA256 state (A,B,E,F) from xmm2/m128, and a pre-computed sum of the next 2 round message dwords and the corresponding round constants from the implicit operand XMM0, storing the updated SHA256 state (A,B,E,F) result in xmm1.
Instruction Operand Encoding Op/En Operand 1 Operand 2 Operand 3 RMI ModRM:reg (r, w) ModRM:r/m (r) Implicit XMM0 (r) Description
The SHA256RNDS2 instruction performs 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from the first operand, an initial SHA256 state (A,B,E,F) from the second operand, and a pre-computed sum of the next 2 round message dwords and the corresponding round constants from the implicit operand xmm0. Note that only the two lower dwords of XMM0 are used by the instruction.
The updated SHA256 state (A,B,E,F) is written to the first operand, and the second operand can be used as the updated state (C,D,G,H) in later rounds.
Operation
SHA256RNDS2
A_0 := SRC2[127:96];
B_0 := SRC2[95:64];
C_0 := SRC1[127:96];
D_0 := SRC1[95:64];
E_0 := SRC2[63:32];
F_0 := SRC2[31:0];
G_0 := SRC1[63:32];
H_0 := SRC1[31:0];
WK0 := XMM0[31: 0];
WK1 := XMM0[63: 32];
FOR i = 0 to 1
A_(i +1) := Ch (E_i, F_i, G_i) +Σ1( E_i) +WKi+ H_i + Maj(A_i , B_i, C_i) +Σ0( A_i);
B_(i +1) := A_i;
C_(i +1) := B_i ;
D_(i +1) := C_i;
E_(i +1) := Ch (E_i, F_i, G_i) +Σ1( E_i) +WKi+ H_i + D_i;
F_(i +1) := E_i ;
G_(i +1) := F_i;
H_(i +1) := G_i;
ENDFOR
DEST[127:96] := A_2;
DEST[95:64] := B_2;
DEST[63:32] := E_2;
DEST[31:0] := F_2;
# Copyright © 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.ident "Copyright (C) 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>"
.file "sha-256.s"
.arch znver1
.code32
.att_syntax
# Layout of the SHA256_CTX structure: the labels defined below are plain
# byte offsets (0, 32 and 96) which the routines of this file use as
# displacements, for example state(%edx) or block(%edx, %ecx).
.struct 0 # SHA256_CTX structure
state: # 8 double words: {h, g, d, c, f, e, b, a}
.space 32
block: # 16 double words
.space 64
count: # 2 double words
.space 8
# Read-only data: shuffle masks, SHA-256 start values and round constants.
# All of them are read with MOVDQA, so every 16-byte group must stay
# 16-byte aligned; the section starts on a 64-byte boundary.
.section .const, "ar", @progbits
.align 64
reverse: # constants for endian conversion
# PSHUFB mask which reverses all 16 bytes (used by sha256_final)
.octa 0x000102030405060708090A0B0C0D0E0F
endian:
# PSHUFB mask which reverses the bytes within each double word (used by sha256_core)
.long 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
hgdc: # SHA-256 start values: H7, H6, H3, H2
.long 0x5BE0CD19, 0x1F83D9AB, 0xA54FF53A, 0x3C6EF372
feba: # SHA-256 start values: H5, H4, H1, H0
.long 0x9B05688C, 0x510E527F, 0xBB67AE85, 0x6A09E667
k256: # SHA-256 round constants
# 64 double words, 4 per line; sha256_core addresses them relative to k256+128
.long 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5
.long 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5
.long 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3
.long 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174
.long 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC
.long 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA
.long 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7
.long 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967
.long 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13
.long 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85
.long 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3
.long 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070
.long 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5
.long 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3
.long 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208
.long 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
.size k256, .-k256
.type k256, @object
# sha256 r0, r1, r2, r3 -- perform 4 rounds of SHA-256
#
# \r0 receives the 4 message schedule words of this step, \r1 ... \r3 hold
# the 12 words of the three preceding steps.  \@ is the assembler's count of
# macro expansions and serves as step number.
# NOTE(review): this assumes that sha256 is the only macro expanded in this
#               file, so that \@ runs from 0 to 15 -- confirm before adding
#               another macro.
# Expects: ecx = address of k256 + 128, edx = address of context structure,
#          xmm3 = byte swap mask, xmm1 = {f, e, b, a}, xmm2 = {h, g, d, c};
#          xmm0 is used as scratch and as implicit operand of sha256rnds2.
.macro sha256 r0 :req, r1 :req, r2 :req, r3 :req
.if \@ < 4
# steps 0 to 3: fetch 16 bytes of the message block and swap them to host order
movdqu block+16*\@(%edx), \r0
pshufb %xmm3, \r0 # \r0 = {ntohl(block[\@*4]), ntohl(block[\@*4+1]),
# ntohl(block[\@*4+2]), ntohl(block[\@*4+3])}
# = {w[\@*4], w[\@*4+1], w[\@*4+2], w[\@*4+3]}
.else
# steps 4 to 15: compute the next 4 words of the message schedule
# \r0 = {w[\@*4-16], w[\@*4-15], w[\@*4-14], w[\@*4-13]},
# \r1 = {w[\@*4-12], w[\@*4-11], w[\@*4-10], w[\@*4-9]},
# \r2 = {w[\@*4-8], w[\@*4-7], w[\@*4-6], w[\@*4-5]},
# \r3 = {w[\@*4-4], w[\@*4-3], w[\@*4-2], w[\@*4-1]}
sha256msg1 \r1, \r0
movdqa \r3, %xmm0
palignr $4, \r2, %xmm0 # xmm0 = {w[\@*4-7], w[\@*4-6], w[\@*4-5], w[\@*4-4]}
paddd %xmm0, \r0
sha256msg2 \r3, \r0 # \r0 = {w[\@*4], w[\@*4+1], w[\@*4+2], w[\@*4+3]}
.endif
movdqa \@*16-128(%ecx), %xmm0 # xmm0 = {k[\@*4], k[\@*4+1], k[\@*4+2], k[\@*4+3]}
paddd \r0, %xmm0 # xmm0 = {(w+k)[\@*4], (w+k)[\@*4+1], (w+k)[\@*4+2], (w+k)[\@*4+3]}
# AT&T operand order: the source (first operand) supplies {f, e, b, a},
# the destination (second operand) supplies {h, g, d, c} and receives the
# updated {f, e, b, a}; the untouched source then is the new {h, g, d, c}
sha256rnds2 %xmm1, %xmm2 # xmm2 = {f', e', b', a'},
# xmm1 = {h', g', d', c'}
punpckhqdq %xmm0, %xmm0 # xmm0 = {(w+k)[\@*4+2], (w+k)[\@*4+3], (w+k)[\@*4+2], (w+k)[\@*4+3]}
sha256rnds2 %xmm2, %xmm1 # xmm1 = {f", e", b", a"},
# xmm2 = {h", g", d", c"}
.endm
.text
# void sha256_core(sha256_ctx *context)
#
# Processes the 64-byte message block stored in the context structure and
# adds the result of the 64 rounds to the state stored there.
# Uses ecx, edx and xmm0 to xmm7; all other registers are left untouched
# (sha256_final and sha256_update rely on eax, esi and edi being preserved).
#
# The sha256 macro issues "sha256rnds2 %xmm1, %xmm2" first, i.e. it needs
# {f, e, b, a} in xmm1 (source) and {h, g, d, c} in xmm2 (destination);
# after every expansion the registers have the same roles again.
# The state is therefore loaded as xmm1 = second half, xmm2 = first half
# (the original code loaded the halves the other way around and thus fed
# the rounds with exchanged operands).
sha256_core: # void SHA256_Core(SHA256_CTX *context)
leal k256+128, %ecx # ecx = address of round constants
movl 4(%esp), %edx # edx = address of context structure
movdqu state+16(%edx), %xmm1 # xmm1 = {f, e, b, a}
movdqu state(%edx), %xmm2 # xmm2 = {h, g, d, c}
movdqa endian, %xmm3 # xmm3 = constant for endian conversion
.rept 4 # 4*4*4 rounds
sha256 %xmm4, %xmm5, %xmm6, %xmm7
sha256 %xmm5, %xmm6, %xmm7, %xmm4
sha256 %xmm6, %xmm7, %xmm4, %xmm5
sha256 %xmm7, %xmm4, %xmm5, %xmm6
.endr
movdqu state+16(%edx), %xmm7 # xmm7 = {f, e, b, a}
movdqu state(%edx), %xmm6 # xmm6 = {h, g, d, c}
paddd %xmm7, %xmm1 # xmm1 = {f', e', b', a'} + {f, e, b, a}
# = {f", e", b", a"}
paddd %xmm6, %xmm2 # xmm2 = {h', g', d', c'} + {h, g, d, c}
# = {h", g", d", c"}
movdqu %xmm1, state+16(%edx)
movdqu %xmm2, state(%edx) # state = {h", g", d", c", f", e", b", a"}
retl
.global sha256_core
.size sha256_core, .-sha256_core
.type sha256_core, @function
# void sha256_final(sha256_ctx *context, unsigned char digest[32])
#
# Appends the padding (one byte 0b10000000, then zero bytes) and the
# message length in bits to the buffered data, processes the resulting
# block(s) with sha256_core and stores the state in big-endian byte
# order, a first, into digest.
# The stack offsets below account for the saved edi: after the first
# push, context is at 8(%esp) and digest at 12(%esp).
sha256_final: # void SHA256_Final(SHA256_CTX *context,
# unsigned char digest[32])
movl 4(%esp), %edx # edx = address of context structure
movl count(%edx), %ecx # ecx = low double word of count
andl $63, %ecx # ecx = number of bytes in block
# = index of first free byte in block
pushl %edi
leal block(%edx, %ecx), %edi # edi = address of first free byte in block
pushl %edi
.Lpad_1:
movb $0b10000000, %al
stosb # block[index] = 0b10000000
.Lpad_0:
xorl %eax, %eax # eax = 0
xorl $63, %ecx # ecx = number of free bytes in block - 1
# = 63 - index
rep stosb # block[index + 1, 63] = 0
subl $8, %edi # edi = address of last quad word in block
popl %eax # eax = address of first free byte in block
cmpl %eax, %edi
ja .Lpad_count # space for count available in block?
# index < 56?
# no room for the length: process this block, then build a second one
pushl %edx
calll sha256_core
popl %edx
.Lpad_block:
movl %edi, %ecx # ecx = address of last quad word in block
leal block(%edx), %edi # edi = address of block
xorl %eax, %eax # eax = 0
subl %edi, %ecx # ecx = number of bytes before last quad word
# = 56
rep stosb # block[0, 55] = 0,
# edi = address of last quad word in block
.Lpad_count:
movl count(%edx), %ecx
movl count+4(%edx), %eax # eax:ecx = count
shldl $3, %ecx, %eax
shll $3, %ecx # eax:ecx = count * 8
# = number of message bits
bswapl %eax
bswapl %ecx # eax:ecx = htonll(number of message bits)
stosl
movl %ecx, %eax
stosl # block[56, 63] = number of message bits
pushl %edx
calll sha256_core
popl %edx
.Ldigest:
movdqu state(%edx), %xmm1 # xmm1 = {h, g, d, c}
movdqu state+16(%edx), %xmm2 # xmm2 = {f, e, b, a}
movdqa %xmm1, %xmm0 # xmm0 = {h, g, d, c}
punpcklqdq %xmm2, %xmm1 # xmm1 = {h, g, f, e}
punpckhqdq %xmm2, %xmm0 # xmm0 = {d, c, b, a}
movdqa reverse, %xmm2 # xmm2 = shuffle indices
pshufb %xmm2, %xmm1 # xmm1 = {htonl(e), htonl(f), htonl(g), htonl(h)}
pshufb %xmm2, %xmm0 # xmm0 = {htonl(a), htonl(b), htonl(c), htonl(d)}
movl 12(%esp), %edx # edx = address of digest
movdqu %xmm0, (%edx)
movdqu %xmm1, 16(%edx)
popl %edi
retl
.global sha256_final
.size sha256_final, .-sha256_final
.type sha256_final, @function
# void sha256_init(sha256_ctx *context)
#
# Resets the byte count to 0 and stores the SHA-256 start values in the
# permuted order {H7, H6, H3, H2, H5, H4, H1, H0} in the state; the
# message block buffer is not touched.
sha256_init: # void SHA256_Init(SHA256_CTX *context)
movl 4(%esp), %edx # edx = address of context structure
xorl %eax, %eax # eax = 0
movdqa hgdc, %xmm1 # xmm1 = {H7, H6, H3, H2}
movdqa feba, %xmm2 # xmm2 = {H5, H4, H1, H0}
movl %eax, count(%edx)
movl %eax, count+4(%edx) # count = 0
movdqu %xmm1, state(%edx)
movdqu %xmm2, state+16(%edx) # state = {H7, H6, H3, H2, H5, H4, H1, H0}
# = {h, g, d, c, f, e, b, a}
retl
.global sha256_init
.size sha256_init, .-sha256_init
.type sha256_init, @function
# void sha256_update(sha256_ctx *context, void const *data, unsigned int size)
#
# Adds size to the byte count, then copies the data into the message block
# buffer of the context; every time the buffer is full it is processed
# with sha256_core.  Remaining bytes stay buffered for the next call or
# for sha256_final.  A size of 0 is a no-op.
sha256_update: # void SHA256_Update(SHA256_CTX *context,
# void const *data,
# unsigned int size)
movl 12(%esp), %eax # eax = number of bytes in data
testl %eax, %eax
jz .Lnone # no data?
movl 4(%esp), %edx # edx = address of context structure
movl count(%edx), %ecx # ecx = low double word of count
andl $63, %ecx # ecx = number of bytes in block
# = index of first free byte in block
addl %eax, count(%edx)
adcl $0, count+4(%edx) # count += number of bytes in data
pushl %esi
movl 12(%esp), %esi # esi = address of data
# (8(%esp) before the push of esi)
pushl %edi
.Ldata:
leal block(%edx, %ecx), %edi # edi = address of first free byte in block
xorl $63, %ecx
incl %ecx # ecx = number of free bytes in block
subl %ecx, %eax # eax = number of bytes in data
# - number of free bytes in block
jb .Llast # number of bytes in data < number of free bytes in block?
.Lmore:
rep movsb # esi = address of remaining data,
# block is full now
pushl %edx
calll sha256_core # preserves eax, esi and edi,
# but returns with ecx = address of k256 + 128
popl %edx
xorl %ecx, %ecx # ecx = 0 = index of first free byte in block
# (sha256_core overwrote the 0 left by rep movsb;
# without this the next pass through .Ldata would
# compute a wild destination address and length)
testl %eax, %eax
jnz .Ldata # more data?
popl %edi
popl %esi
.Lnone:
retl
.Llast:
addl %eax, %ecx # ecx = number of bytes in data
rep movsb
popl %edi
popl %esi
retl
.global sha256_update
.size sha256_update, .-sha256_update
.type sha256_update, @function
.end
#include
the following
ANSI C
header file in your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Context of one SHA-256 computation, shared with the assembler routines.
// NOTE: should be 16-byte aligned!
typedef struct _sha256_ctx
{
    unsigned int state[8];  // hash state, stored as {h, g, d, c, f, e, b, a}
    unsigned int block[16]; // buffer for one 64-byte message block
    unsigned int count[2];  // number of message bytes: low, high double word
} sha256_ctx;

// Process the message block buffered in the context.
void sha256_core(sha256_ctx *context);

// Pad the message, process the last block(s) and store the 32-byte digest.
void sha256_final(sha256_ctx *context, unsigned char digest[32]);

// Set the start values and reset the byte count.
void sha256_init(sha256_ctx *context);

// Feed size bytes of data into the computation.
void sha256_update(sha256_ctx *context, void const *data, unsigned int size);
; Copyright © 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; Public names of the four procedures defined below: the short aliases
; core, final, init and update are replaced by these texts.
core textequ <SHA256_Core>
final textequ <SHA256_Final>
init textequ <SHA256_Init>
update textequ <SHA256_Update>
; Layout of the SHA256_CTX structure: 32 bytes state, 64 bytes message
; block buffer and the 64-bit count of message bytes.
context struct 16 ; SHA256_CTX structure
state oword 2 dup (?) ; {h, g, d, c, f, e, b, a}
block oword 4 dup (?)
count qword ?
context ends
;; sha256 r0, r1, r2, r3, s -- perform 4 rounds of SHA-256
;;
;; r0 ... r3 must be registers, s must be a constant multiple of 4 in the
;; range 0 to 60: the number of the first of the 4 rounds.
;; r0 receives the message schedule words w[s] to w[s+3], r1 ... r3 hold
;; the 12 preceding words.
;; Expects: rcx = address of context structure, rdx = address of k256 + 128,
;;          xmm3 = byte swap mask, xmm1 = {f, e, b, a}, xmm2 = {h, g, d, c};
;;          xmm0 is used as scratch and as implicit operand of sha256rnds2.
sha256 macro r0 :req, r1 :req, r2 :req, r3 :req, s :req
;; argument checks, evaluated at assembly time
.erre 16 and (opattr &r0) and (opattr &r1) and (opattr &r2) and (opattr &r3)
.erre 4 and (opattr &s)
.errnz 3 and &s or -64 and &s
if &s lt 16
;; rounds 0 to 15: fetch 16 bytes of the message block and swap them to host order
movdqu &r0, context.block[rcx+&s*4]
pshufb &r0, xmm3 ;; &r0 = {ntohl(block[&s*4]), ntohl(block[&s*4+1]),
;; ntohl(block[&s*4+2]), ntohl(block[&s*4+3])}
;; = {w[&s*4], w[&s*4+1], w[&s*4+2], w[&s*4+3]}
;; NOTE(review): the indices in the comments of this macro are written as &s*4,
;;               which is the BYTE offset used by the instructions; the
;;               word indices presumably are &s to &s+3 -- confirm
else ; &s ge 16
;; rounds 16 to 63: compute the next 4 words of the message schedule
;; &r0 = {w[&s*4-16], w[&s*4-15], w[&s*4-14], w[&s*4-13]},
;; &r1 = {w[&s*4-12], w[&s*4-11], w[&s*4-10], w[&s*4-9]},
;; &r2 = {w[&s*4-8], w[&s*4-7], w[&s*4-6], w[&s*4-5]},
;; &r3 = {w[&s*4-4], w[&s*4-3], w[&s*4-2], w[&s*4-1]}
sha256msg1 &r0, &r1
movdqa xmm0, &r3
palignr xmm0, &r2, 4 ;; xmm0 = {w[&s*4-7], w[&s*4-6], w[&s*4-5], w[&s*4-4]}
paddd &r0, xmm0
sha256msg2 &r0, &r3 ;; &r0 = {w[&s*4], w[&s*4+1], w[&s*4+2], w[&s*4+3]}
endif ; &s ge 16
movdqa xmm0, [rdx-128+&s*4] ;; xmm0 = {k[&s*4], k[&s*4+1], k[&s*4+2], k[&s*4+3]}
paddd xmm0, &r0 ;; xmm0 = {(w+k)[&s*4], (w+k)[&s*4+1], (w+k)[&s*4+2], (w+k)[&s*4+3]}
;; the destination supplies {h, g, d, c} and receives the updated {f, e, b, a};
;; the untouched source then is the new {h, g, d, c}
sha256rnds2 xmm2, xmm1; xmm0 ;; xmm2 = {f', e', b', a'},
;; xmm1 = {h', g', d', c'}
punpckhqdq xmm0, xmm0 ;; xmm0 = {(w+k)[&s*4+2], (w+k)[&s*4+3], (w+k)[&s*4+2], (w+k)[&s*4+3]}
sha256rnds2 xmm1, xmm2; xmm0 ;; xmm1 = {f", e", b", a"},
;; xmm2 = {h", g", d", c"}
endm
; Read-only data: shuffle masks, SHA-256 start values and round constants.
; All of them are read with MOVDQA and therefore must be 16-byte aligned.
.const
; constants for endian conversion
; PSHUFB mask which reverses all 16 bytes (used by final)
reverse oword 000102030405060708090A0B0C0D0E0Fh
; PSHUFB mask which reverses the bytes within each double word (used by core)
endian label oword
dword 00010203h, 04050607h, 08090A0Bh, 0C0D0E0Fh
hgdc label oword ; SHA-256 start values: H7, H6, H3, H2
dword 05BE0CD19h, 01F83D9ABh, 0A54FF53Ah, 03C6EF372h
feba label oword ; SHA-256 start values: H5, H4, H1, H0
dword 09B05688Ch, 0510E527Fh, 0BB67AE85h, 06A09E667h
k256 label oword ; SHA-256 round constants
; 64 double words, 4 per line; core addresses them relative to k256+128
dword 0428A2F98h, 071374491h, 0B5C0FBCFh, 0E9B5DBA5h
dword 03956C25Bh, 059F111F1h, 0923F82A4h, 0AB1C5ED5h
dword 0D807AA98h, 012835B01h, 0243185BEh, 0550C7DC3h
dword 072BE5D74h, 080DEB1FEh, 09BDC06A7h, 0C19BF174h
dword 0E49B69C1h, 0EFBE4786h, 00FC19DC6h, 0240CA1CCh
dword 02DE92C6Fh, 04A7484AAh, 05CB0A9DCh, 076F988DAh
dword 0983E5152h, 0A831C66Dh, 0B00327C8h, 0BF597FC7h
dword 0C6E00BF3h, 0D5A79147h, 006CA6351h, 014292967h
dword 027B70A85h, 02E1B2138h, 04D2C6DFCh, 053380D13h
dword 0650A7354h, 0766A0ABBh, 081C2C92Eh, 092722C85h
dword 0A2BFE8A1h, 0A81A664Bh, 0C24B8B70h, 0C76C51A3h
dword 0D192E819h, 0D6990624h, 0F40E3585h, 0106AA070h
dword 019A4C116h, 01E376C08h, 02748774Ch, 034B0BCB5h
dword 0391C0CB3h, 04ED8AA4Ah, 05B9CCA4Fh, 0682E6FF3h
dword 0748F82EEh, 078A5636Fh, 084C87814h, 08CC70208h
dword 090BEFFFAh, 0A4506CEBh, 0BEF9A3F7h, 0C67178F2h
.code
; void SHA256_Core(SHA256_CTX *context)
;
; Processes the 64-byte message block stored in the context structure and
; adds the result of the 64 rounds to the state stored there.
; Uses rdx and xmm0 to xmm5; xmm6 and xmm7 are saved in and restored from
; the 32 bytes above the return address, which the callers in this file
; reserve as "home space".  MOVDQA is used for that, i.e. rsp + 8 must be
; 16-byte aligned on entry.
core proc public ; void SHA256_Core(SHA256_CTX *context)
lea rdx, k256+128
movdqa [rsp+8], xmm6
movdqa [rsp+24], xmm7
movdqu xmm1, context.state[rcx+16]
; xmm1 = {f, e, b, a}
movdqu xmm2, context.state[rcx]; xmm2 = {h, g, d, c}
movdqa xmm3, endian
; the four registers xmm4 ... xmm7 rotate through the roles r0 ... r3 of the macro
irp t, <0, 16, 32, 48> ; 4*4*4 rounds
sha256 xmm4, xmm5, xmm6, xmm7, %(&t+0)
sha256 xmm5, xmm6, xmm7, xmm4, %(&t+4)
sha256 xmm6, xmm7, xmm4, xmm5, %(&t+8)
sha256 xmm7, xmm4, xmm5, xmm6, %(&t+12)
endm
movdqu xmm7, context.state[rcx+16]
; xmm7 = {f, e, b, a}
movdqu xmm6, context.state[rcx]; xmm6 = {h, g, d, c}
paddd xmm1, xmm7 ; xmm1 = {f', e', b', a'} + {f, e, b, a}
; = {f", e", b", a"}
paddd xmm2, xmm6 ; xmm2 = {h', g', d', c'} + {h, g, d, c}
; = {h", g", d", c"}
movdqu context.state[rcx+16], xmm1
movdqu context.state[rcx], xmm2; state = {h", g", d", c", f", e", b", a"}
movdqa xmm6, [rsp+8]
movdqa xmm7, [rsp+24]
ret
core endp
; void SHA256_Final(SHA256_CTX *context, unsigned char digest[32])
;
; Appends the padding (one byte 10000000y, then zero bytes) and the
; message length in bits to the buffered data, processes the resulting
; block(s) with core and stores the state in big-endian byte order,
; a first, into digest.
final proc public ; void SHA256_Final(SHA256_CTX *context,
; unsigned char digest[32])
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push rdi
lea rdi, context.block[r9+rcx]
mov r8, rdi ; r8 = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; rax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 63 - index
rep stosb ; block[index + 1, 63] = 0
sub rdi, sizeof context.count
; rdi = address of last qword in block
cmp r8, rdi
jb short pad_count ; space for count available in block?
; index < 56?
; no room for the length: process this block, then build a second one
mov rcx, r9 ; rcx = address of context structure
; rdx (address of digest) is overwritten by core and must be saved;
; the 4 pushes together with the saved rdi and the 32 bytes below keep
; rsp 16-byte aligned at the call
push r9
push r8
push rdx
push rax
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rax
pop rdx
pop r8
pop r9
pad_block:
mov rcx, rdi ; rcx = address of last qword in block
lea rdi, context.block[r9] ; rdi = address of block
; rax still is 0: it was saved and restored around the call above
;; xor eax, eax ; rax = 0
sub rcx, rdi ; rcx = number of bytes before last qword
; = 56
rep stosb ; block[0, 55] = 0,
; rdi = address of last qword in block
pad_count:
mov rax, context.count[r9] ; rax = count
shl rax, 3 ; rax = count * 8
; = number of message bits
bswap rax ; rax = htonll(number of message bits)
stosq ; block[56, 63] = number of message bits
mov rcx, r9 ; rcx = address of context structure
push r9
push rdx
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rdx ; rdx = address of digest
pop r9 ; r9 = address of context structure
pop rdi
movdqa xmm0, reverse
movdqu xmm1, context.state[r9] ; xmm1 = {h, g, d, c}
movdqu xmm2, context.state[r9+16]
; xmm2 = {f, e, b, a}
pshufb xmm1, xmm0 ; xmm1 = {htonl(c), htonl(d), htonl(g), htonl(h)}
pshufb xmm2, xmm0 ; xmm2 = {htonl(a), htonl(b), htonl(e), htonl(f)}
movdqa xmm0, xmm2
punpckhqdq xmm2, xmm1 ; xmm2 = {htonl(e), htonl(f), htonl(g), htonl(h)}
punpcklqdq xmm0, xmm1 ; xmm0 = {htonl(a), htonl(b), htonl(c), htonl(d)}
movdqu [rdx], xmm0
movdqu [rdx+16], xmm2
ret
final endp
; void SHA256_Init(SHA256_CTX *context)
;
; Resets the byte count to 0 and stores the SHA-256 start values in the
; permuted order {H7, H6, H3, H2, H5, H4, H1, H0} in the state; the
; message block buffer is not touched.
init proc public ; void SHA256_Init(SHA256_CTX *context)
xor eax, eax ; rax = 0
movdqa xmm1, hgdc ; xmm1 = {H7, H6, H3, H2}
movdqa xmm2, feba ; xmm2 = {H5, H4, H1, H0}
mov context.count[rcx], rax ; count = 0
movdqu context.state[rcx], xmm1
movdqu context.state[rcx+16], xmm2
; state = {H7, H6, H3, H2, H5, H4, H1, H0}
; = {h, g, d, c, f, e, b, a}
ret
init endp
; void SHA256_Update(SHA256_CTX *context, void const *data, unsigned int size)
;
; Adds size to the byte count, then copies the data into the message block
; buffer of the context; every time the buffer is full it is processed
; with core.  Remaining bytes stay buffered for the next call or for
; SHA256_Final.  A size of 0 is a no-op.
update proc public ; void SHA256_Update(SHA256_CTX *context,
; void const *data,
; unsigned int size)
mov r8d, r8d ; r8 = number of bytes in data, zero-extended:
; size is a 32-bit argument, so the caller
; may leave garbage in the upper half of r8
test r8, r8
jz short none ; no data?
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[r9], r8 ; count += number of bytes in data
push rsi
mov rsi, rdx ; rsi = address of data
push rdi
data:
lea rdi, context.block[r9+rcx]
; rdi = address of first free byte in block
xor ecx, sizeof context.block - 1
inc ecx ; rcx = number of free bytes in block
sub r8, rcx ; r8 = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
rep movsb ; rsi = address of remaining data,
; block is full now
mov rdi, r9 ; keep the address of the context structure
; in a register which the callee must preserve
mov rcx, r9 ; rcx = address of context structure
push r8 ; (also keeps rsp 16-byte aligned at the call)
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop r8
mov r9, rdi
xor ecx, ecx ; rcx = 0 = index of first free byte in block
test r8, r8
jnz short data ; more data?
pop rdi
pop rsi
none:
ret
last:
add rcx, r8 ; rcx = number of bytes in data
rep movsb
pop rdi
pop rsi
ret
update endp
end
Note: the function SHA256_Core()
has
163 instructions in 714 bytes plus 272 bytes read-only data.
Microsoft Macro Assembler Reference
Save the AMD64 assembler source presented above as
sha-256.asm
in an arbitrary, preferably empty
directory, then execute the following 2 command lines to generate
the 64-bit object file sha-256.obj
:
SET ML=/c /W3 /X
ML64.EXE /DALIGNED sha-256.asm
For details and reference see the MSDN article ML and ML64 Command-Line Reference.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) Macro Assembler Version (x64) 14.16.27023.1 Copyright (C) Microsoft Corporation. All rights reserved. Assembling: sha-256.asmSave the following ANSI C header file as
sha-256.h
to #include
it in
your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Context of one SHA-256 computation, shared with the assembler routines.
// NOTE: should be 16-byte aligned!
typedef struct _SHA256_CTX
{
    unsigned int state[8];  // hash state, stored as {h, g, d, c, f, e, b, a}
    unsigned int block[16]; // buffer for one 64-byte message block
    unsigned int count[2];  // 64-bit number of message bytes
} SHA256_CTX;

// Process the message block buffered in the context.
void SHA256_Core(SHA256_CTX *context);

// Pad the message, process the last block(s) and store the 32-byte digest.
void SHA256_Final(SHA256_CTX *context, unsigned char digest[32]);

// Set the start values and reset the byte count.
void SHA256_Init(SHA256_CTX *context);

// Feed size bytes of data into the computation.
void SHA256_Update(SHA256_CTX *context, void const *data, unsigned int size);
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// Context of one SHA-256 computation; the member order and sizes must
// match the structure used by the assembler routines declared below.
__declspec(align(16))
typedef struct _SHA256_CTX
{
DWORD State[8];  // hash state; the test program reads it as
                 // [7], [6], [3], [2], [5], [4], [1], [0] to print the digest words in order
DWORD Block[16]; // buffer for one 64-byte message block
DWORD64 Count;   // number of message bytes
} SHA256_CTX;
// Routines implemented in the separately assembled object file.
VOID CDECL SHA256_Core(SHA256_CTX *Context);
VOID CDECL SHA256_Final(SHA256_CTX *Context, BYTE Digest[32]);
VOID CDECL SHA256_Init(SHA256_CTX *Context);
VOID CDECL SHA256_Update(SHA256_CTX *Context, LPCVOID Data, DWORD Size);
// __edivmodu(N, D): divide the 64-bit unsigned dividend N by the 32-bit
// divisor D and yield the quotient followed by the remainder; used to feed
// the two "%lu" conversions of the timing output.
#ifndef _M_IX86
// NOTE: expands to TWO comma-separated expressions, i.e. two arguments,
//       and evaluates both N and D twice
#define __edivmodu(N, D) (DWORD) ((N) / (D)), (DWORD) ((N) % (D))
#else
__forceinline // companion for __emulu()
struct
{
DWORD ulQuotient, ulRemainder;
} CDECL __edivmodu(DWORD64 ullDividend, DWORD ulDivisor)
{
// DIV divides EDX:EAX by its operand and leaves the quotient in EAX and
// the remainder in EDX.
// NOTE(review): there is no return statement -- presumably this relies on
//               the 8-byte structure being returned in EDX:EAX; confirm.
// NOTE(review): DIV faults when the quotient does not fit in 32 bits.
__asm mov eax, dword ptr ullDividend
__asm mov edx, dword ptr ullDividend+4
__asm div ulDivisor
}
#endif // _M_IX86
// Formats the arguments according to lpFormat with wvsprintf() and writes
// the resulting text to the console hConsole.
// Returns TRUE only if at least one character was produced and all
// characters were written, FALSE otherwise.
__declspec(safebuffers)
BOOL CDECL PrintConsole(HANDLE hConsole, [SA_FormatString(Style="printf")] LPCWSTR lpFormat, ...)
{
    WCHAR szBuffer[1024];
    DWORD dwLength;
    DWORD dwWritten;
    va_list vaArguments;

    va_start(vaArguments, lpFormat);
    dwLength = wvsprintf(szBuffer, lpFormat, vaArguments);
    va_end(vaArguments);

    // nothing formatted: nothing to write
    if (dwLength == 0)
        return FALSE;

    if (!WriteConsole(hConsole, szBuffer, dwLength, &dwWritten, NULL))
        return FALSE;

    // a partial write counts as failure
    return dwWritten == dwLength;
}
// Entry point of the test program: verifies the SHA-256 routines against
// known digests, then measures the throughput of SHA256_Core() over
// 1000000000 / 64 message blocks and terminates the process with the last
// Win32 error code recorded (ERROR_SUCCESS if there was none).
//
// Each test prints the expected digest in lower case, followed by the
// computed one in upper case, read from the permuted state
// {h, g, d, c, f, e, b, a} of the context.
//
// Fixes against the previous revision:
//  - the SHA extension check was "dwCPUID[1] & (1 << 29) == 0", which
//    parses as "dwCPUID[1] & ((1 << 29) == 0)" and thus never was true;
//  - the fallback text "unidentified processor" was copied to dwCPUID,
//    but is printed from dwCPUID + 4 (and dwCPUID[0...3] are overwritten
//    by the following __cpuid() call);
//  - the second GetThreadTimes() call stored creation, exit and kernel
//    time in qwThread[0] and thereby destroyed the user time taken before
//    the loop; the unused times now go to a scratch variable;
//  - the error messages named a non-existent QueryThreadTimes().
__declspec(noreturn)
VOID CDECL wmainCRTStartup(VOID)
{
    SHA256_CTX Context;
    BYTE cbDigest[32], cbMillion[1000000];
    DWORD dwCPUID[16];              // [0...3] = scratch, [4...15] = processor name
    DWORD dwError = ERROR_SUCCESS;
    DWORD dwThread = 1000000000 / 64;   // number of blocks for the timing loop
    DWORD64 qwThread[2];            // [0] = start, [1] = end of the timing loop
#ifndef CYCLES
    FILETIME ftUnused;              // receives creation, exit and kernel time
#endif
    HANDLE hThread = GetCurrentThread();
    HANDLE hConsole = GetStdHandle(STD_ERROR_HANDLE);

    if (hConsole == INVALID_HANDLE_VALUE)
        dwError = GetLastError();
    else
    {
        __cpuid(dwCPUID, 0x80000000);
        if (*dwCPUID >= 0x80000004)
        {
            __cpuid(dwCPUID + 4, 0x80000002);
            __cpuid(dwCPUID + 8, 0x80000003);
            __cpuid(dwCPUID + 12, 0x80000004);
        }
        else
            __movsb((LPBYTE) (dwCPUID + 4), (BYTE const *) "unidentified processor", sizeof("unidentified processor"));

        __cpuid(dwCPUID, 7);
        if ((dwCPUID[1] & (1UL << 29)) == 0)    // EBX bit 29 = SHA extensions
            PrintConsole(hConsole, L"SHA-NI instructions not supported on %.48hs!\n", dwCPUID + 4);
        else
        {
            if (SetThreadIdealProcessor(hThread, 0) == -1)
                PrintConsole(hConsole,
                             L"SetThreadIdealProcessor() returned error %lu\n",
                             dwError = GetLastError());

            if (!SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST))
                PrintConsole(hConsole,
                             L"SetThreadPriority() returned error %lu\n",
                             dwError = GetLastError());

            PrintConsole(hConsole, L"\nTesting SHA-256 implementation...\n");

            SHA256_Init(&Context);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"\"\n"
                         L"\te3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, "abc", 3);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abc\"\n"
                         L"\tba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
                          sizeof("abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq") - 1);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq\"\n"
                         L"\t248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                          sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                         L"\tcf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, "\xBD", 1);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"\\xBD\"\n"
                         L"\t68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, "\xC9\x8C\x8E\x55", 4);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"\\xC9\\x8C\\x8E\\x55\"\n"
                         L"\t7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // messages of 55, 56 and 57 bytes straddle the point where the
            // padding no longer fits into the same block as the data
            __stosb(cbMillion, 0, 1000);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 55);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×55\n"
                         L"\t02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 56);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×56\n"
                         L"\td4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 57);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×57\n"
                         L"\t65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 64);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×64\n"
                         L"\tf5a5fd42 d16a2030 2798ef6e d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 1000);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×1000\n"
                         L"\t541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            __stosb(cbMillion, 'A', 1000);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 1000);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"A…A\"\n"
                         L"\tc2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            __stosb(cbMillion, 'U', 1005);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 1005);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"U…U\"\n"
                         L"\tf4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            __stosb(cbMillion, 'a', 1000000);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 1000000);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"a…a\"\n"
                         L"\tcdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            __stosb(cbMillion, 0, 1000000);

            SHA256_Init(&Context);
            SHA256_Update(&Context, cbMillion, 1000000);
            SHA256_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×1000000\n"
                         L"\td29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025\n"
                         L"\t%08lX %08lX %08lX %08lX %08lX %08lX %08lX %08lX\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            PrintConsole(hConsole, L"\nTiming SHA-256 on %.48hs:\n", dwCPUID + 4);

            // the loop processes 1000000000 bytes, so the total count of
            // clock cycles (or seconds) divided by 10**9 is the count per
            // byte (or the nano-seconds per byte)
#ifdef CYCLES
            if (!QueryThreadCycleTime(hThread, qwThread))
                PrintConsole(hConsole,
                             L"QueryThreadCycleTime() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA256_Core(&Context);
                while (--dwThread);

                if (!QueryThreadCycleTime(hThread, qwThread + 1))
                    PrintConsole(hConsole,
                                 L"QueryThreadCycleTime() returned error %lu\n",
                                 dwError = GetLastError());
                else
                    PrintConsole(hConsole,
                                 L"%lu.%09lu clock cycles per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 1000000000));
            }
#else
            // only the user time (last argument) is of interest
            if (!GetThreadTimes(hThread, &ftUnused, &ftUnused, &ftUnused, (LPFILETIME) qwThread))
                PrintConsole(hConsole,
                             L"GetThreadTimes() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA256_Core(&Context);
                while (--dwThread);

                if (!GetThreadTimes(hThread, &ftUnused, &ftUnused, &ftUnused, (LPFILETIME) qwThread + 1))
                    PrintConsole(hConsole,
                                 L"GetThreadTimes() returned error %lu\n",
                                 dwError = GetLastError());
                else
                    // user time is counted in units of 100 ns: 10**7 per second
                    PrintConsole(hConsole,
                                 L"%lu.%07lu nano-seconds per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 10000000));
            }
#endif // CYCLES
        }
    }

    ExitProcess(dwError);
}
Save the
ANSI C
source presented above as sha-256.c
next to the object
file sha-256.obj
assembled before, then run the
following 4 command lines to build the 64-bit console application
sha-256.exe
and execute it:
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /DCYCLES /Fosha-256.tmp sha-256.c sha-256.obj kernel32.lib user32.lib
.\sha-256.exe
For details and reference see the MSDN articles Compiler Options and Linker Options.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) C/C++ Optimizing Compiler Version 16.00.40219.01 for x64 Copyright (C) Microsoft Corporation. All rights reserved. sha-256.c Microsoft (R) Incremental Linker Version 10.00.40219.386 Copyright (C) Microsoft Corporation. All rights reserved. /ENTRY:mainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE /out:sha-256.exe sha-256.tmp sha-256.obj kernel32.lib user32.lib Testing SHA-256 implementation... "" e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855 E3B0C442 98FC1C14 9AFBF4C8 996FB924 27AE41E4 649B934C A495991B 7852B855 "abc" ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad BA7816BF 8F01CFEA 414140DE 5DAE2223 B00361A3 96177A9C B410FF61 F20015AD "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq" 248d6a61 d20638b8 e5c02693 0c3e6039 a33ce459 64ff2167 f6ecedd4 19db06c1 248D6A61 D20638B8 E5C02693 0C3E6039 A33CE459 64FF2167 F6ECEDD4 19DB06C1 "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu" cf5b16a7 78af8380 036ce59e 7b049237 0b249b11 e8f07a51 afac4503 7afee9d1 CF5B16A7 78AF8380 036CE59E 7B049237 0B249B11 E8F07A51 AFAC4503 7AFEE9D1 "\xBD" 68325720 aabd7c82 f30f554b 313d0570 c95accbb 7dc4b5aa e11204c0 8ffe732b 68325720 AABD7C82 F30F554B 313D0570 C95ACCBB 7DC4B5AA E11204C0 8FFE732B "\xC9\x8C\x8E\x55" 7abc22c0 ae5af26c e93dbb94 433a0e0b 2e119d01 4f8e7f65 bd56c61c cccd9504 7ABC22C0 AE5AF26C E93DBB94 433A0E0B 2E119D01 4F8E7F65 BD56C61C CCCD9504 '\0'×55 02779466 cdec1638 11d07881 5c633f21 90141308 1449002f 24aa3e80 f0b88ef7 02779466 CDEC1638 11D07881 5C633F21 90141308 1449002F 24AA3E80 F0B88EF7 '\0'×56 d4817aa5 497628e7 c77e6b60 6107042b bba31308 88c5f47a 375e6179 be789fbb D4817AA5 497628E7 C77E6B60 6107042B BBA31308 88C5F47A 375E6179 BE789FBB '\0'×57 65a16cb7 861335d5 ace3c607 18b5052e 44660726 da4cd13b b745381b 235a1785 65A16CB7 861335D5 ACE3C607 18B5052E 44660726 DA4CD13B B745381B 235A1785 '\0'×64 f5a5fd42 d16a2030 2798ef6e 
d309979b 43003d23 20d9f0e8 ea9831a9 2759fb4b F5A5FD42 D16A2030 2798EF6E D309979B 43003D23 20D9F0E8 EA9831A9 2759FB4B '\0'×1000 541b3e9d aa09b20b f85fa273 e5cbd3e8 0185aa4e c298e765 db87742b 70138a53 541B3E9D AA09B20B F85FA273 E5CBD3E8 0185AA4E C298E765 DB87742B 70138A53 "A…A" c2e68682 3489ced2 017f6059 b8b23931 8b6364f6 dcd835d0 a519105a 1eadd6e4 C2E68682 3489CED2 017F6059 B8B23931 8B6364F6 DCD835D0 A519105A 1EADD6E4 "U…U" f4d62dde c0f3dd90 ea1380fa 16a5ff8d c4c54b21 740650f2 4afc4120 903552b0 F4D62DDE C0F3DD90 EA1380FA 16A5FF8D C4C54B21 740650F2 4AFC4120 903552B0 "a…a" cdc76e5c 9914fb92 81a1c7e2 84d73e67 f1809a48 a497200e 046d39cc c7112cd0 CDC76E5C 9914FB92 81A1C7E2 84D73E67 F1809A48 A497200E 046D39CC C7112CD0 '\0'×1000000 d29751f2 649b32ff 572b5e0a 9f541ea6 60a50f94 ff0beedf b0b692b9 24cc8025 D29751F2 649B32FF 572B5E0A 9F541EA6 60A50F94 FF0BEEDF B0B692B9 24CC8025 Timing SHA-256 on AMD Ryzen 7 5700X 8-Core Processor : 1.578519190 clock cycles per byteAt the nominal clock frequency of 3.4 GHz this is more than 2 GB per second.
Build the console application sha-256.exe
a second time,
now without the preprocessor macro CYCLES
defined, and
execute it:
CL.EXE /Fosha-256.tmp sha-256.c sha-256.obj
.\sha-256.exe
[…] Timing SHA-256 on AMD Ryzen 7 5700X 8-Core Processor : 0.4687500 nano-seconds per byteIn other units: 2.133 GB per second, 6.8 times the throughput of the 32-bit assembly code using general purpose registers shown above!
Note: the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 3.368 GHz, matching the nominal clock frequency within a 1% margin.
Execution of both console applications on several Intel processors which support the SHA Extensions yields the following results:
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz: 1.445777621 clock cycles per byte […] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz: 0.3906250 nano-seconds per byte8.8 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: on this processor, the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 3.701 GHz!
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-11400T @ 1.30GHz: 1.002475398 clock cycles per byte […] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i5-11400T @ 1.30GHz: 0.7812500 nano-seconds per byte6.5 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: on this processor, the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 1.283 GHz, matching its nominal clock frequency within a 1.3% margin.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz: 1.582532317 clock cycles per byte […] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz: 0.5625000 nano-seconds per byte5.8 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 2.813 GHz, matching its nominal clock frequency within a 0.5% margin.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700 @ 2.50GHz: 1.337258862 clock cycles per byte […] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700 @ 2.50GHz: 0.5000000 nano-seconds per byte5.9 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 2.675 GHz, 7% above the nominal clock frequency.
[…] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700K @ 3.60GHz: 1.868331132 clock cycles per byte […] Timing SHA-256 on 11th Gen Intel(R) Core(TM) i7-11700K @ 3.60GHz: 0.5625000 nano-seconds per byte5.8 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 3.321 GHz, 7.7% below the nominal clock frequency.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i5-12400F: 1.257839522 clock cycles per byte […] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i5-12400F: 0.5156250 nano-seconds per byte4.2 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: on this processor, the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 2.439 GHz, 2.4% below its nominal clock frequency of 2.5 GHz.
[…] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i7-12700K: 1.515160836 clock cycles per byte […] Timing SHA-256 on 12th Gen Intel(R) Core(TM) i7-12700K: 0.4218750 nano-seconds per byte4.2 times faster than the 32-bit assembly code using general purpose registers shown above!
Note: on this processor, the quotient clock cycles ⁄ nano-seconds gives an effective clock frequency of 3.591 GHz, matching its nominal clock frequency of 3.6 GHz.
[…]
Timing SHA-256 on 13th Gen Intel(R) Core(TM) i5-1335U:
2.170230435 clock cycles per byte
[…]
Timing SHA-256 on 13th Gen Intel(R) Core(TM) i5-1335U:
0.0000000 nano-seconds per byte
4.6 times faster than the 32-bit assembly code using general purpose
registers shown above!
OUCH: on this processor with unknown nominal clock frequency, the execution time measurement yields an obviously wrong value!
VSHA512MSG1
,
VSHA512MSG2
and
VSHA512RNDS2
InstructionsVSHA512MSG1–Perform an Intermediate Calculation for the Next Four SHA512 Message QwordsOops:
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description VEX.256.F2.0F38.W0 CC 11:rrr:bbb
VSHA512MSG1 ymm1, xmm2A V/V AVX
SHA512Performs an intermediate calculation for the next four SHA512 message qwords using previous message qwords from ymm1 and xmm2, storing the result in ymm1.
Instruction Operand Encoding Op/En Tuple Operand 1 Operand 2 Operand 3 Operand 4 A N/A ModRM:reg (r, w) ModRM:r/m (r) N/A N/A Description
The VSHA512MSG1 instruction is one of two SHA512 message scheduling instructions. The instruction performs an intermediate calculation for the next four SHA512 message qwords.
See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf for more information on the SHA512 standard.
Operation
define ROR64(qword, n):
count := n % 64
dest := (qword >> count) | (qword << (64-count))
return dest
define SHR64(qword, n):
return qword >> n
define s0(qword):
return ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7)
VSHA512MSG1 SRCDEST, SRC1
W[4] := SRC1.qword[0]
W[3] := SRCDEST.qword[3]
W[2] := SRCDEST.qword[2]
W[1] := SRCDEST.qword[1]
W[0] := SRCDEST.qword[0]
SRCDEST.qword[3] := W[3] + s0(W[4])
SRCDEST.qword[2] := W[2] + s0(W[3])
SRCDEST.qword[1] := W[1] + s0(W[2])
SRCDEST.qword[0] := W[0] + s0(W[1])
AVX
alias
Sandy Bridge New Instructions don’t support
256-bit wide YMM
registers; the highlighted
CPUID feature
flag is therefore wrong and should be AVX2
(alias
Haswell New Instructions) instead!
VSHA512MSG2–Perform a Final Calculation for the Next Four SHA512 Message QwordsOops:
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description VEX.256.F2.0F38.W0 CD 11:rrr:bbb
VSHA512MSG2 ymm1, ymm2A V/V AVX
SHA512Performs the final calculation for the next four SHA512 message qwords using previous message qwords from ymm1 and ymm2, storing the result in ymm1.
Instruction Operand Encoding Op/En Tuple Operand 1 Operand 2 Operand 3 Operand 4 A N/A ModRM:reg (r, w) ModRM:r/m (r) N/A N/A Description
The VSHA512MSG2 instruction is one of two SHA2 message scheduling instructions. The instruction performs the final calculation for the next four SHA512 message qwords.
See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf for more information on the SHA512 standard.
Operation
define ROR64(qword, n):
count := n % 64
dest := (qword >> count) | (qword << (64-count))
return dest
define SHR64(qword, n):
return qword >> n
define s1(qword):
return ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6)
VSHA512MSG2 SRCDEST, SRC1
W[14] := SRC1.qword[2]
W[15] := SRC1.qword[3]
W[16] := SRCDEST.qword[0] + s1(W[14])
W[17] := SRCDEST.qword[1] + s1(W[15])
W[18] := SRCDEST.qword[2] + s1(W[16])
W[19] := SRCDEST.qword[3] + s1(W[17])
SRCDEST.qword[3] := W[19]
SRCDEST.qword[2] := W[18]
SRCDEST.qword[1] := W[17]
SRCDEST.qword[0] := W[16]
AVX
alias
Sandy Bridge New Instructions don’t support
256-bit wide YMM
registers; the highlighted
CPUID feature
flag is therefore wrong and should be AVX2
(alias
Haswell New Instructions) instead!
VSHA512RNDS2–Perform Two Rounds of SHA512 OperationOops:
Opcode/Instruction Op/En 64/32 bit Mode Support CPUID Feature Flag Description VEX.256.F2.0F38.W0 CB 11:rrr:bbb
VSHA512RNDS2 ymm1, ymm2, xmm3A V/V AVX
SHA512Perform 2 rounds of SHA512 operation using an initial SHA512 state (C,D,G,H) from ymm1, an initial SHA512 state (A,B,E,F) from ymm2, and a pre-computed sum of the next 2 round message qwords and the corresponding round constants from xmm3, storing the updated SHA512 state (A,B,E,F) result in ymm1.
Instruction Operand Encoding Op/En Tuple Operand 1 Operand 2 Operand 3 Operand 4 A N/A ModRM:reg (r, w) VEX.vvvv (r) ModRM:r/m (r) N/A Description
The VSHA512RNDS2 instruction performs 2 rounds of SHA512 operation using an initial SHA512 state (C,D,G,H) from the first operand, an initial SHA512 state (A,B,E,F) from the second operand, and a pre-computed sum of the next 2 round message qwords and the corresponding round constants from the third operand (only the two lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is written to the first operand, and the second operand can be used as the updated state (C,D,G,H) in later rounds.
See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf for more information on the SHA512 standard.
Operation
define ROR64(qword, n):
count := n % 64
dest := (qword >> count) | (qword << (64-count))
return dest
define SHR64(qword, n):
return qword >> n
define cap_sigma0(qword):
return ROR64(qword,28) ^ ROR64(qword, 34) ^ ROR64(qword, 39)
define cap_sigma1(qword):
return ROR64(qword,14) ^ ROR64(qword, 18) ^ ROR64(qword, 41)
define MAJ(a,b,c):
return (a & b) ^ (a & c) ^ (b & c)
define CH(e,f,g):
return (e & f) ^ (g & ~e)
VSHA512RNDS2 SRCDEST, SRC1, SRC2
A[0] := SRC1.qword[3]
B[0] := SRC1.qword[2]
C[0] := SRCDEST.qword[3]
D[0] := SRCDEST.qword[2]
E[0] := SRC1.qword[1]
F[0] := SRC1.qword[0]
G[0] := SRCDEST.qword[1]
H[0] := SRCDEST.qword[0]
WK[0]:= SRC2.qword[0]
WK[1]:= SRC2.qword[1]
FOR i in 0..1:
A[i+1] := CH(E[i], F[i], G[i]) +
cap_sigma1(E[i]) + WK[i] + H[i] +
MAJ(A[i], B[i], C[i]) +
cap_sigma0(A[i])
B[i+1] := A[i]
C[i+1] := B[i]
D[i+1] := C[i]
E[i+1] := CH(E[i], F[i], G[i]) +
cap_sigma1(E[i]) + WK[i] + H[i] + D[i]
F[i+1] := E[i]
G[i+1] := F[i]
H[i+1] := G[i]
SRCDEST.qword[3] = A[2]
SRCDEST.qword[2] = B[2]
SRCDEST.qword[1] = E[2]
SRCDEST.qword[0] = F[2]
AVX
alias
Sandy Bridge New Instructions don’t support
256-bit wide YMM
registers; the highlighted
CPUID feature
flag is therefore wrong and should be AVX2
(alias
Haswell New Instructions) instead!
SHA512
and AVX2
Instructions for GNU as# Copyright © 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
.ident "Copyright (C) 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>"
.file "sha-ni-512.s"
.arch znver1
.code32
.att_syntax
# Layout of the SHA512_CTX structure, expressed as absolute offsets
# (208 bytes in total): state at 0, block at 64, count at 192.
.struct 0 # SHA512_CTX structure
state: # 8 quad words: {h, g, d, c, f, e, b, a}
.space 64
block: # 16 quad words
# (128-byte message block buffer)
.space 128
count: # 2 quad words
# (128-bit number of message bytes processed, low quad word first)
.space 16
# Read-only constants: two VPSHUFB byte-shuffle masks, the SHA-512 initial
# hash values (pre-arranged in the register order used by VSHA512RNDS2)
# and the 80 SHA-512 round constants.
.section .const, "ar"
.align 64
# reverse, endian, hgdc and feba are read with VMOVDQA and therefore rely
# on the 32-byte alignment that follows from this 64-byte alignment.
reverse: # constants for endian conversion
# (mask reverses all 16 bytes of each 128-bit lane;
#  the literal is written with 31 hex digits, i.e. the
#  leading zero of 0x000102...0E0F is omitted -- same value)
.octa 0x00102030405060708090A0B0C0D0E0F
.octa 0x00102030405060708090A0B0C0D0E0F
endian:
# mask reverses the 8 bytes of each quad word individually
.quad 0x0001020304050607, 0x08090A0B0C0D0E0F
.quad 0x0001020304050607, 0x08090A0B0C0D0E0F
hgdc: # SHA-512 start values: H7, H6, H3, H2
.quad 0x5BE0CD19137E2179, 0x1F83D9ABFB41BD6B, 0xA54FF53A5F1D36F1, 0x3C6EF372FE94F82B
feba: # SHA-512 start values: H5, H4, H1, H0
.quad 0x9B05688C2B3E6C1F, 0x510E527FADE682D1, 0xBB67AE8584CAA73B, 0x6A09E667F3BCC908
k512: # SHA-512 round constants
# (80 quad words, 4 per line = 32 bytes per line)
.quad 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC
.quad 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118
.quad 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2
.quad 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694
.quad 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65
.quad 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5
.quad 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4
.quad 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70
.quad 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF
.quad 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B
.quad 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30
.quad 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8
.quad 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8
.quad 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3
.quad 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC
.quad 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B
.quad 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178
.quad 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B
.quad 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C
.quad 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
.size k512, .-k512
.type k512, @object
# sha512 m0, m1, m2, m3 -- perform four SHA-512 rounds (2 x VSHA512RNDS2).
#
# The macro keys off the invocation counter \@: invocation number \@
# processes the message schedule words w[\@*4] ... w[\@*4+3].  The first
# four invocations load and byte-swap the message block, all later ones
# derive the words from the previous sixteen held in \m0 ... \m3.
#
# Expects: edx = address of context, ecx = address of the round constants
#          for the current group of 16 rounds, ymm3 = byte-swap mask,
#          ymm1 = {f, e, b, a}, ymm2 = {h, g, d, c}.
# Clobbers ymm0; updates \m0, ymm1 and ymm2.
#
# Fixes relative to the previous revision:
#  - the block is read in 32-byte (4 quad word) steps, not 16-byte steps;
#  - the third operand of VSHA512RNDS2 is an XMM register.
# NOTE(review): VSHA512MSG1 is documented as "ymm1, xmm2", so its source
# \m1 presumably has to be given as the XMM alias of the register too;
# that cannot be derived from the macro argument here -- confirm with the
# assembler in use.
.macro sha512 m0, m1, m2, m3 # 2*2 rounds
.if \@ < 4
vmovdqu block+32*\@(%edx), \m0
vpshufb %ymm3, \m0, \m0 # \m0 = {ntohll(block[\@*4]), ntohll(block[\@*4+1]),
# ntohll(block[\@*4+2]), ntohll(block[\@*4+3])}
# = {w[\@*4], w[\@*4+1], w[\@*4+2], w[\@*4+3]}
.else
# \m0 = {w[\@*4-16], w[\@*4-15], w[\@*4-14], w[\@*4-13]},
# \m1 = {w[\@*4-12], w[\@*4-11], w[\@*4-10], w[\@*4-9]},
# \m2 = {w[\@*4-8], w[\@*4-7], w[\@*4-6], w[\@*4-5]},
# \m3 = {w[\@*4-4], w[\@*4-3], w[\@*4-2], w[\@*4-1]}
vsha512msg1 \m1, \m0
vpblendd $3, \m3, \m2, %ymm0 # ymm0 = {w[\@*4-4], w[\@*4-7], w[\@*4-6], w[\@*4-5]}
vpermq $57, %ymm0, %ymm0 # ymm0 = {w[\@*4-7], w[\@*4-6], w[\@*4-5], w[\@*4-4]}
vpaddq %ymm0, \m0, \m0
vsha512msg2 \m3, \m0 # \m0 = {w[\@*4], w[\@*4+1], w[\@*4+2], w[\@*4+3]}
.endif
vpaddq \@%4*32(%ecx), \m0, %ymm0
# ymm0 = {(w+k)[\@*4], (w+k)[\@*4+1], (w+k)[\@*4+2], (w+k)[\@*4+3]}
vsha512rnds2 %xmm0, %ymm1, %ymm2
# ymm2 = {f', e', b', a'},
# ymm1 = {h', g', d', c'}
.if 1
vperm2i128 $129, %ymm0, %ymm0, %ymm0
# ymm0 = {(w+k)[\@*4+2], (w+k)[\@*4+3], 0, 0}
.elseif 1
vextracti128 $1, %ymm0, %xmm0 # ymm0 = {(w+k)[\@*4+2], (w+k)[\@*4+3], 0, 0}
.else
vpermq $78, %ymm0, %ymm0 # ymm0 = {(w+k)[\@*4+2], (w+k)[\@*4+3], (w+k)[\@*4+2], (w+k)[\@*4+3]}
.endif
vsha512rnds2 %xmm0, %ymm2, %ymm1
# ymm1 = {f", e", b", a"},
# ymm2 = {h", g", d", c"}
.endm
.text
# sha512_core: run the SHA-512 compression function over the 128-byte
# block buffered in the context and add the result to the state.
#
# In:  4(%esp) = address of context structure
# Out: state updated in place; no return value
# Clobbers ecx, edx and ymm0 ... ymm7; eax is not touched.
sha512_core: # void SHA512_Core(SHA512_CTX *context)
movl 4(%esp), %edx # edx = address of context structure
vmovdqu state+32(%edx), %ymm1 # ymm1 = {f, e, b, a}
vmovdqu state(%edx), %ymm2 # ymm2 = {h, g, d, c}
vmovdqa endian, %ymm3 # ymm3 = constant for endian conversion
leal k512, %ecx
# ecx = address of round constants
.rept 5 # 5*4*4 rounds
# each pass rotates the four message registers by one position
sha512 %ymm4, %ymm5, %ymm6, %ymm7
sha512 %ymm5, %ymm6, %ymm7, %ymm4
sha512 %ymm6, %ymm7, %ymm4, %ymm5
sha512 %ymm7, %ymm4, %ymm5, %ymm6
subl $-128, %ecx
# ecx += 128: advance to the next 16 round constants
.endr
vmovdqu state+32(%edx), %ymm7 # ymm7 = {f, e, b, a}
vmovdqu state(%edx), %ymm6 # ymm6 = {h, g, d, c}
vpaddq %ymm7, %ymm1, %ymm1 # ymm1 = {f', e', b', a'} + {f, e, b, a}
# = {f", e", b", a"}
vpaddq %ymm6, %ymm2, %ymm2 # ymm2 = {h', g', d', c'} + {h, g, d, c}
# = {h", g", d", c"}
vmovdqu %ymm1, state+32(%edx)
vmovdqu %ymm2, state(%edx) # state = {h", g", d", c", f", e", b", a"}
retl
.global sha512_core
.size sha512_core, .-sha512_core
.type sha512_core, @function
# sha512_final: append the SHA-512 padding (0x80, zeroes, 128-bit big-endian
# bit count) to the buffered message, process the final block(s) and store
# the 64-byte big-endian digest.
#
# In:  4(%esp) = address of context structure
#      8(%esp) = address of 64-byte digest buffer
# Out: digest written; no return value
# Preserves edi (saved on entry, restored before returning -- the restore
# was missing in the previous revision, so RETL used the saved edi as the
# return address).
sha512_final: # void SHA512_Final(SHA512_CTX *context,
# unsigned char digest[64])
movl 4(%esp), %edx # edx = address of context structure
movl count(%edx), %ecx # ecx = low double word of count
andl $127, %ecx # ecx = number of bytes in block
# = index of first free byte in block
pushl %edi
leal block(%edx, %ecx), %edi # edi = address of first free byte in block
pushl %edi
.Lpad_1:
movb $0b10000000, %al
stosb # block[index] = 0b10000000
.Lpad_0:
xorl %eax, %eax # eax = 0
xorl $127, %ecx # ecx = number of free bytes in block - 1
# = 127 - index
rep stosb # block[index + 1, 127] = 0
subl $16, %edi # edi = address of last octa word in block
popl %eax # eax = address of first free byte in block
cmpl %eax, %edi
ja .Lpad_count # space for count available in block?
# index < 112?
pushl %edx # argument for (and saved across) sha512_core
calll sha512_core
popl %edx
.Lpad_block:
movl %edi, %ecx # ecx = address of last octa word in block
leal block(%edx), %edi # edi = address of block
xorl %eax, %eax # eax = 0
subl %edi, %ecx # ecx = number of bytes before last octa word
# = 112
rep stosb # block[0, 111] = 0,
# edi = address of last octa word in block
.Lpad_count:
movl count+12(%edx), %eax
movl count+8(%edx), %ecx # eax:ecx = high quad word of count
shldl $3, %ecx, %eax
bswapl %eax
stosl
movl %ecx, %eax
movl count+4(%edx), %ecx
shldl $3, %ecx, %eax
bswapl %eax
stosl
movl %ecx, %eax
movl count(%edx), %ecx
shldl $3, %ecx, %eax
shll $3, %ecx # eax:ecx = low quad word of count * 8
bswapl %eax
bswapl %ecx # eax:ecx = htonll(low quad word of number of message bits)
stosl
movl %ecx, %eax
stosl # block[112, 127] = number of message bits
pushl %edx # argument for (and saved across) sha512_core
calll sha512_core
popl %edx
.Ldigest:
vmovdqa reverse, %ymm0
vmovdqu state(%edx), %ymm1 # ymm1 = {h, g, d, c}
vmovdqu state+32(%edx), %ymm2 # ymm2 = {f, e, b, a}
vpshufb %ymm0, %ymm1, %ymm1 # ymm1 = {htonll(g), htonll(h), htonll(c), htonll(d)}
vpshufb %ymm0, %ymm2, %ymm2 # ymm2 = {htonll(e), htonll(f), htonll(a), htonll(b)}
movl 12(%esp), %edx # edx = address of digest
# (offset 12: saved edi is still on the stack)
vperm2i128 $19, %ymm2, %ymm1, %ymm0
# ymm0 = {htonll(a), htonll(b), htonll(c), htonll(d)}
vmovdqu %ymm0, (%edx)
vperm2i128 $2, %ymm2, %ymm1, %ymm0
# ymm0 = {htonll(e), htonll(f), htonll(g), htonll(h)}
vmovdqu %ymm0, 32(%edx)
popl %edi # restore callee-saved edi, rebalance the stack
retl
.global sha512_final
.size sha512_final, .-sha512_final
.type sha512_final, @function
# sha512_init: reset a context to the SHA-512 initial hash values and a
# byte count of zero.  The block buffer is left untouched.
#
# In:  4(%esp) = address of context structure
# Clobbers edx, xmm0, ymm1 and ymm2.
sha512_init: # void SHA512_Init(SHA512_CTX *context)
movl 4(%esp), %edx # edx = address of context structure
pxor %xmm0, %xmm0 # xmm0 = 0
vmovdqa hgdc, %ymm1 # ymm1 = {H7, H6, H3, H2}
vmovdqa feba, %ymm2 # ymm2 = {H5, H4, H1, H0}
movdqu %xmm0, count(%edx) # count = 0
vmovdqu %ymm1, state(%edx)
vmovdqu %ymm2, state+32(%edx) # state = {H7, H6, H3, H2, H5, H4, H1, H0}
# = {h, g, d, c, f, e, b, a}
retl
.global sha512_init
.size sha512_init, .-sha512_init
.type sha512_init, @function
# sha512_update: feed `size` bytes of message data into the context.
# Data is copied into the 128-byte block buffer; every time the buffer
# becomes full it is processed with sha512_core.  A trailing partial block
# stays buffered for the next call or for sha512_final.
#
# In:   4(%esp) = address of context structure
#       8(%esp) = address of data
#      12(%esp) = size, low double word
#      16(%esp) = size, high double word
# Preserves ebx, esi and edi; clobbers eax, ecx and edx.
#
# Fix relative to the previous revision: the address of the first free
# byte is computed from ebx (context), not from edx, which holds the high
# double word of the remaining size at that point.
sha512_update: # void SHA512_Update(SHA512_CTX *context,
# void const *data,
# unsigned long long size)
movl 12(%esp), %eax
movl 16(%esp), %edx # edx:eax = number of bytes in data
movl %eax, %ecx
orl %edx, %ecx
jz .Lnone # no data?
pushl %ebx
movl 8(%esp), %ebx # ebx = address of context structure
movl count(%ebx), %ecx # ecx = low(est) double word of count
andl $127, %ecx # ecx = number of bytes in block
# = index of first free byte in block
addl %eax, count(%ebx)
adcl %edx, count+4(%ebx)
adcl $0, count+8(%ebx)
adcl $0, count+12(%ebx) # count += number of bytes in data
pushl %esi
movl 16(%esp), %esi # esi = address of data
pushl %edi
.Ldata:
leal block(%ebx, %ecx), %edi # edi = address of first free byte in block
xorl $127, %ecx
incl %ecx # ecx = number of free bytes in block
subl %ecx, %eax
sbbl $0, %edx # edx:eax = number of bytes in data
# - number of free bytes in block
jb .Llast # number of bytes in data < number of free bytes in block?
.Lmore:
rep movsb # esi = address of remaining data
movl %edx, %edi # keep high double word across the call
# (sha512_core overwrites edx)
pushl %ebx # argument for (and saved across) sha512_core
calll sha512_core
popl %ebx
movl %edi, %edx # edx:eax = number of remaining bytes in data
xorl %ecx, %ecx # ecx = 0 = index of first free byte in block
orl %eax, %edi
jnz .Ldata # more data?
popl %edi
popl %esi
popl %ebx
.Lnone:
retl
.Llast:
addl %eax, %ecx # ecx = number of bytes in data
rep movsb
popl %edi
popl %esi
popl %ebx
retl
.global sha512_update
.size sha512_update, .-sha512_update
.type sha512_update, @function
.end
#include
the following
ANSI C
header file in your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Hashing context shared with the assembly routines: 8 quad words of
// state, a 16 quad word (128 byte) block buffer and a 2 quad word byte
// counter -- the member order must match the offsets used by the assembly.
typedef struct _sha512_ctx { // NOTE: should be 32-byte aligned!
unsigned long long state[8], block[16], count[2];
} sha512_ctx;
// Processes the 128-byte block currently buffered in the context.
extern void sha512_core(sha512_ctx *context);
// Pads the message, processes the last block(s) and writes the 64-byte digest.
extern void sha512_final(sha512_ctx *context, unsigned char digest[64]);
// Resets state and byte counter of the context.
extern void sha512_init(sha512_ctx *context);
// Adds `size` bytes at `data` to the message being hashed.
extern void sha512_update(sha512_ctx *context, void const *data, unsigned long long size);
SHA512
and AVX2
Instructions for MASM; Copyright © 2017-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
; Short internal names for the exported procedures: the text macros
; expand to the public SHA512_* identifiers.
core textequ <SHA512_Core>
final textequ <SHA512_Final>
init textequ <SHA512_Init>
update textequ <SHA512_Update>
; Layout of the hashing context: 64 bytes of state, a 128-byte block
; buffer and a 128-bit byte counter.
context struct 32 ; SHA512_CTX structure
state ymmword 2 dup (?) ; {h, g, d, c, f, e, b, a}
block ymmword 4 dup (?)
count qword 2 dup (?)
context ends
;; Hand-encodes VSHA512MSG1 dst, src as raw bytes (opcode 0CCh).
;; The register numbers are taken from the 4th character of the operand
;; names, so only single-digit registers (ymm0 ... ymm7) are encodable.
vsha512msg1 macro dst :req, src :req
.erre 16 and (opattr &dst) and (opattr &src)
byte 196, 226, 127 ;; 3-byte prefix VEX.256.F2.0F38.W0.VVVV0
byte 204, 192+8*@SubStr(&dst, 4, 1)+@SubStr(&src, 4, 1)
endm
;; Hand-encodes VSHA512MSG2 dst, src as raw bytes (opcode 0CDh).
;; The register numbers are taken from the 4th character of the operand
;; names, so only single-digit registers (ymm0 ... ymm7) are encodable.
vsha512msg2 macro dst :req, src :req
.erre 16 and (opattr &dst) and (opattr &src)
byte 196, 226, 127 ;; 3-byte prefix VEX.256.F2.0F38.W0.VVVV0
byte 205, 192+8*@SubStr(&dst, 4, 1)+@SubStr(&src, 4, 1)
endm
;; Hand-encodes VSHA512RNDS2 dst, vvvv, src as raw bytes (opcode 0CBh).
;; The second source register is stored inverted in the VEX.vvvv field,
;; hence 127-8*n; dst and src go into the ModRM byte.  As the register
;; numbers are taken from the operand names starting at the 4th
;; character, only ymm0 ... ymm7 are encodable.
vsha512rnds2 macro dst :req, vvvv :req, src :req
.erre 16 and (opattr &dst) and (opattr &vvvv) and (opattr &src)
byte 196, 226, 127-8*@SubStr(&vvvv, 4)
;; 3-byte prefix VEX.256.F2.0F38.W0.VVVV
byte 203, 192+8*@SubStr(&dst, 4, 1)+@SubStr(&src, 4, 1)
endm
;; sha512 r0, r1, r2, r3, s -- perform the four SHA-512 rounds s ... s+3.
;;
;; s must be a constant multiple of 4 in the range 0 ... 76.  For s < 16
;; the message words w[s] ... w[s+3] are loaded from the block buffer and
;; byte-swapped; for s >= 16 they are derived from the previous sixteen
;; words held in r0 ... r3.
;;
;; Expects: rcx = address of context, rdx = address of the round constants
;;          for the current group of 16 rounds, ymm3 = byte-swap mask,
;;          ymm1 = {f, e, b, a}, ymm2 = {h, g, d, c}.
;; Clobbers ymm0; updates r0, ymm1 and ymm2.
;;
;; Fixes relative to the previous revision: message words and round
;; constants are quad words, so their byte offsets are s*8 and
;; (s mod 16)*8 -- not s*4 and (s mod 16)*32.
sha512 macro r0 :req, r1 :req, r2 :req, r3 :req, s :req
.erre 16 and (opattr &r0) and (opattr &r1) and (opattr &r2) and (opattr &r3)
.erre 4 and (opattr &s)
.erre 0 le &s and &s lt 80
.errnz 3 and &s
if &s lt 16
vmovdqu &r0, context.block[rcx+&s*8]
vpshufb &r0, &r0, ymm3 ;; &r0 = {ntohll(block[&s]), ntohll(block[&s+1]),
;; ntohll(block[&s+2]), ntohll(block[&s+3])}
;; = {w[&s], w[&s+1], w[&s+2], w[&s+3]}
else ; &s ge 16
;; &r0 = {w[&s-16], w[&s-15], w[&s-14], w[&s-13]},
;; &r1 = {w[&s-12], w[&s-11], w[&s-10], w[&s-9]},
;; &r2 = {w[&s-8], w[&s-7], w[&s-6], w[&s-5]},
;; &r3 = {w[&s-4], w[&s-3], w[&s-2], w[&s-1]}
vsha512msg1 &r0, &r1
vpblendd ymm0, &r2, &r3, 3 ;; ymm0 = {w[&s-4], w[&s-7], w[&s-6], w[&s-5]}
vpermq ymm0, ymm0, 57 ;; ymm0 = {w[&s-7], w[&s-6], w[&s-5], w[&s-4]}
vpaddq &r0, &r0, ymm0
vsha512msg2 &r0, &r3 ;; &r0 = {w[&s], w[&s+1], w[&s+2], w[&s+3]}
endif ; &s ge 16
vpaddq ymm0, &r0, [rdx+(&s mod 16)*8]
;; ymm0 = {(w+k)[&s], (w+k)[&s+1], (w+k)[&s+2], (w+k)[&s+3]}
vsha512rnds2 ymm2, ymm1, ymm0 ;; ymm2 = {f', e', b', a'},
;; ymm1 = {h', g', d', c'}
if 1
vperm2i128 ymm0, ymm0, ymm0, 129;; ymm0 = {(w+k)[&s+2], (w+k)[&s+3], 0, 0}
elseif 1
vextracti128 xmm0, ymm0, 1 ;; ymm0 = {(w+k)[&s+2], (w+k)[&s+3], 0, 0}
else
vpermq ymm0, ymm0, 78 ;; ymm0 = {(w+k)[&s+2], (w+k)[&s+3], (w+k)[&s+2], (w+k)[&s+3]}
endif
vsha512rnds2 ymm1, ymm2, ymm0 ;; ymm1 = {f", e", b", a"},
;; ymm2 = {h", g", d", c"}
endm
; Read-only constants: two VPSHUFB byte-shuffle masks, the SHA-512 initial
; hash values (pre-arranged in the register order used by VSHA512RNDS2)
; and the 80 SHA-512 round constants.
.const
; constants for endian conversion
; (reverse: mask reverses all 16 bytes of each 128-bit lane)
reverse label ymmword
oword 000102030405060708090A0B0C0D0E0Fh
oword 000102030405060708090A0B0C0D0E0Fh
; (endian: mask reverses the 8 bytes of each quad word individually)
endian label ymmword
qword 0001020304050607h, 08090A0B0C0D0E0Fh
qword 0001020304050607h, 08090A0B0C0D0E0Fh
hgdc label ymmword ; SHA-512 start values: H7, H6, H3, H2
qword 05BE0CD19137E2179h, 01F83D9ABFB41BD6Bh, 0A54FF53A5F1D36F1h, 03C6EF372FE94F82Bh
feba label ymmword ; SHA-512 start values: H5, H4, H1, H0
qword 09B05688C2B3E6C1Fh, 0510E527FADE682D1h, 0BB67AE8584CAA73Bh, 06A09E667F3BCC908h
k512 label ymmword ; SHA-512 round constants
; (80 quad words, 4 per line = 32 bytes per line)
qword 0428A2F98D728AE22h, 07137449123EF65CDh, 0B5C0FBCFEC4D3B2Fh, 0E9B5DBA58189DBBCh
qword 03956C25BF348B538h, 059F111F1B605D019h, 0923F82A4AF194F9Bh, 0AB1C5ED5DA6D8118h
qword 0D807AA98A3030242h, 012835B0145706FBEh, 0243185BE4EE4B28Ch, 0550C7DC3D5FFB4E2h
qword 072BE5D74F27B896Fh, 080DEB1FE3B1696B1h, 09BDC06A725C71235h, 0C19BF174CF692694h
qword 0E49B69C19EF14AD2h, 0EFBE4786384F25E3h, 00FC19DC68B8CD5B5h, 0240CA1CC77AC9C65h
qword 02DE92C6F592B0275h, 04A7484AA6EA6E483h, 05CB0A9DCBD41FBD4h, 076F988DA831153B5h
qword 0983E5152EE66DFABh, 0A831C66D2DB43210h, 0B00327C898FB213Fh, 0BF597FC7BEEF0EE4h
qword 0C6E00BF33DA88FC2h, 0D5A79147930AA725h, 006CA6351E003826Fh, 0142929670A0E6E70h
qword 027B70A8546D22FFCh, 02E1B21385C26C926h, 04D2C6DFC5AC42AEDh, 053380D139D95B3DFh
qword 0650A73548BAF63DEh, 0766A0ABB3C77B2A8h, 081C2C92E47EDAEE6h, 092722C851482353Bh
qword 0A2BFE8A14CF10364h, 0A81A664BBC423001h, 0C24B8B70D0F89791h, 0C76C51A30654BE30h
qword 0D192E819D6EF5218h, 0D69906245565A910h, 0F40E35855771202Ah, 0106AA07032BBD1B8h
qword 019A4C116B8D2D0C8h, 01E376C085141AB53h, 02748774CDF8EEB99h, 034B0BCB5E19B48A8h
qword 0391C0CB3C5C95A63h, 04ED8AA4AE3418ACBh, 05B9CCA4F7763E373h, 0682E6FF3D6B2B8A3h
qword 0748F82EE5DEFB2FCh, 078A5636F43172F60h, 084C87814A1F0AB72h, 08CC702081A6439ECh
qword 090BEFFFA23631E28h, 0A4506CEBDE82BDE9h, 0BEF9A3F7B2C67915h, 0C67178F2E372532Bh
qword 0CA273ECEEA26619Ch, 0D186B8C721C0C207h, 0EADA7DD6CDE0EB1Eh, 0F57D4F7FEE6ED178h
qword 006F067AA72176FBAh, 00A637DC5A2C898A6h, 0113F9804BEF90DAEh, 01B710B35131C471Bh
qword 028DB77F523047D84h, 032CAAB7B40C72493h, 03C9EBE0A15C9BEBCh, 0431D67C49C100D4Ch
qword 04CC5D4BECB3E42B6h, 0597F299CFC657E2Ah, 05FCB6FAB3AD6FAECh, 06C44198C4A475817h
.code
; SHA512_Core: run the SHA-512 compression function over the 128-byte
; block buffered in the context and add the result to the state.
;
; In:  rcx = address of context structure
; Out: state updated in place; no return value
; Uses rdx and ymm0 ... ymm7.  The low halves of the registers ymm6 and
; ymm7 (xmm6, xmm7) are saved to and restored from [rsp+8] ... [rsp+39],
; i.e. the 32 bytes directly above the return address, which every caller
; in this file reserves with "sub rsp, 32" before the call.
core proc public ; void SHA512_Core(SHA512_CTX *context)
movdqa [rsp+8], xmm6
movdqa [rsp+24], xmm7
vmovdqu ymm1, context.state[rcx+32]
; ymm1 = {f, e, b, a}
vmovdqu ymm2, context.state[rcx]; ymm2 = {h, g, d, c}
vmovdqa ymm3, endian
; ymm3 = constant for endian conversion
irp t, <0, 16, 32, 48, 64> ; 5*4*4 rounds
if &t eq 0
lea rdx, k512
else
sub rdx, -128
; rdx += 128: advance to the next 16 round constants
endif
sha512 ymm4, ymm5, ymm6, ymm7, %(&t+0)
sha512 ymm5, ymm6, ymm7, ymm4, %(&t+4)
sha512 ymm6, ymm7, ymm4, ymm5, %(&t+8)
sha512 ymm7, ymm4, ymm5, ymm6, %(&t+12)
endm
vmovdqu ymm7, context.state[rcx+32]
; ymm7 = {f, e, b, a}
vmovdqu ymm6, context.state[rcx]; ymm6 = {h, g, d, c}
vpaddq ymm1, ymm1, ymm7 ; ymm1 = {f', e', b', a'} + {f, e, b, a}
; = {f", e", b", a"}
vpaddq ymm2, ymm2, ymm6 ; ymm2 = {h', g', d', c'} + {h, g, d, c}
; = {h", g", d", c"}
vmovdqu context.state[rcx+32], ymm1
vmovdqu context.state[rcx], ymm2; state = {h", g", d", c", f", e", b", a"}
movdqa xmm6, [rsp+8]
movdqa xmm7, [rsp+24]
ret
core endp
; SHA512_Final: append the SHA-512 padding (0x80, zeroes, 128-bit
; big-endian bit count) to the buffered message, process the final
; block(s) and store the 64-byte big-endian digest.
;
; In:  rcx = address of context structure
;      rdx = address of 64-byte digest buffer
; Out: digest written; no return value
; Preserves rdi (pushed on entry, popped before the digest is stored).
final proc public ; void SHA512_Final(SHA512_CTX *context,
; unsigned char digest[64])
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
push rdi
lea rdi, context.block[r9+rcx]
mov r8, rdi ; r8 = address of first free byte in block
pad_1:
mov al, 10000000y
stosb ; block[index] = 0b10000000
pad_0:
xor eax, eax ; rax = 0
xor ecx, sizeof context.block - 1
; ecx = number of free bytes in block - 1
; = 127 - index
rep stosb ; block[index + 1, 127] = 0
sub rdi, sizeof context.count
; rdi = address of last oword in block
cmp r8, rdi
jb short pad_count ; space for count available in block?
; index < 112?
mov rcx, r9 ; rcx = address of context structure
push r9
push r8
push rdx
push rax
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rax
pop rdx
pop r8
pop r9
pad_block:
mov rcx, rdi ; rcx = address of last oword in block
lea rdi, context.block[r9] ; rdi = address of block
;; xor eax, eax ; rax = 0
; (not needed: rax = 0 was saved and restored around the call above)
sub rcx, rdi ; rcx = number of bytes before last oword
; = 112
rep stosb ; block[0, 111] = 0,
; rdi = address of last oword in block
pad_count:
mov rax, context.count[r9+8]
mov rcx, context.count[r9] ; rax:rcx = count
shld rax, rcx, 3
shl rcx, 3 ; rax:rcx = count * 8
; = number of message bits
bswap rax
bswap rcx ; rax:rcx = htonll(number of message bits)
stosq
mov rax, rcx
stosq ; block[112, 127] = number of message bits
mov rcx, r9 ; rcx = address of context structure
push r9
push rdx
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop rdx ; rdx = address of digest
pop r9 ; r9 = address of context structure
pop rdi
vmovdqa ymm0, reverse
vmovdqu ymm1, context.state[r9] ; ymm1 = {h, g, d, c}
vmovdqu ymm2, context.state[r9+32]
; ymm2 = {f, e, b, a}
vpshufb ymm1, ymm1, ymm0 ; ymm1 = {htonll(g), htonll(h), htonll(c), htonll(d)}
vpshufb ymm2, ymm2, ymm0 ; ymm2 = {htonll(e), htonll(f), htonll(a), htonll(b)}
vperm2i128 ymm0, ymm1, ymm2, 19 ; ymm0 = {htonll(a), htonll(b), htonll(c), htonll(d)}
vmovdqu [rdx], ymm0
vperm2i128 ymm0, ymm1, ymm2, 2 ; ymm0 = {htonll(e), htonll(f), htonll(g), htonll(h)}
vmovdqu [rdx+32], ymm0
ret
final endp
; SHA512_Init: reset a context to the SHA-512 initial hash values and a
; byte count of zero.  The block buffer is left untouched.
;
; In:  rcx = address of context structure
; Uses xmm0, ymm1 and ymm2.
init proc public ; void SHA512_Init(SHA512_CTX *context)
pxor xmm0, xmm0 ; xmm0 = 0
vmovdqa ymm1, hgdc ; ymm1 = {H7, H6, H3, H2}
vmovdqa ymm2, feba ; ymm2 = {H5, H4, H1, H0}
movdqu context.count[rcx], xmm0; count = 0
vmovdqu context.state[rcx], ymm1
vmovdqu context.state[rcx+32], ymm2
; state = {H7, H6, H3, H2, H5, H4, H1, H0}
; = {h, g, d, c, f, e, b, a}
ret
init endp
; SHA512_Update: feed `size` bytes of message data into the context.
; Data is copied into the 128-byte block buffer; every time the buffer
; becomes full it is processed with SHA512_Core.  A trailing partial
; block stays buffered for the next call or for SHA512_Final.
;
; In:  rcx = address of context structure
;      rdx = address of data
;      r8  = number of bytes in data (all 64 bits are used)
; Preserves rsi and rdi.
update proc public ; void SHA512_Update(SHA512_CTX *context,
; void const *data,
; unsigned long long size)
test r8, r8
jz short none ; no data?
mov r9, rcx ; r9 = address of context structure
mov rcx, context.count[r9] ; rcx = low qword of count
and ecx, sizeof context.block - 1
; ecx = number of bytes in block
; = index of first free byte in block
add context.count[r9], r8
adc context.count[r9+8], 0 ; count += number of bytes in data
push rsi
mov rsi, rdx ; rsi = address of data
push rdi
data:
lea rdi, context.block[r9+rcx]
; rdi = address of first free byte in block
xor ecx, sizeof context.block - 1
inc ecx ; rcx = number of free bytes in block
sub r8, rcx ; r8 = number of bytes in data
; - number of free bytes in block
jb short last ; number of bytes in data < number of free bytes in block?
more:
rep movsb ; rsi = address of remaining data
mov rdi, r9
; rdi = address of context structure, kept across the call
mov rcx, r9 ; rcx = address of context structure
push r8
sub rsp, 32 ; "home space"
call core
add rsp, 32
pop r8
mov r9, rdi
xor ecx, ecx ; rcx = 0 = index of first free byte in block
test r8, r8
jnz short data ; more data?
pop rdi
pop rsi
none:
ret
last:
add rcx, r8 ; rcx = number of bytes in data
rep movsb
pop rdi
pop rsi
ret
update endp
end
Note: the function SHA512_Core()
has
187 instructions in 984 bytes plus 544 bytes read-only data.
Microsoft Macro Assembler Reference
Save the AMD64 assembler source presented above as
sha-512.asm
in an arbitrary, preferably empty
directory, then execute the following 2 command lines to generate
the 64-bit object file sha-512.obj
:
SET ML=/c /W3 /X
ML64.EXE /DALIGNED sha-512.asm
For details and reference see the MSDN article ML and ML64 Command-Line Reference.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) Macro Assembler Version (x64) 14.16.27023.1 Copyright (C) Microsoft Corporation. All rights reserved. Assembling: sha-512.asmSave the following ANSI C header file as
sha-512.h
to #include
it in
your
ANSI C
sources:
// Copyleft © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
// Hashing context shared with the assembler routines in sha-512.obj;
// the member order must match the layout the assembler source uses.
typedef struct _SHA512_CTX { // NOTE: should be 32-byte aligned!
    unsigned long long state[8];    // 8 qwords of hash state
    unsigned long long block[16];   // 128-byte buffer for the current block
    unsigned long long count[2];    // 128-bit count of bytes, low qword first
} SHA512_CTX;

// Routine invoked by SHA512_Update() each time the block buffer is full.
void SHA512_Core(SHA512_CTX *context);

// Finishes the computation and stores the 64-byte digest.
void SHA512_Final(SHA512_CTX *context, unsigned char digest[64]);

// Prepares a context for a new computation.
void SHA512_Init(SHA512_CTX *context);

// Feeds size bytes starting at data into the computation.
void SHA512_Update(SHA512_CTX *context, const void *data, unsigned long long size);
// Copyright © 2004-2024, Stefan Kanthak <stefan.kanthak@nexgo.de>
#define STRICT
#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
// Hashing context passed to the SHA512_*() routines of sha-512.obj.
// The member order mirrors the declaration in sha-512.h (state, block, count);
// the structure is forced to 32-byte alignment as that header recommends.
__declspec(align(32))
typedef struct _SHA512_CTX
{
// 8 qwords of hash state; printed by the test program after SHA512_Final()
DWORD64 State[8];
// 16 qwords = 128 bytes: buffer for the block being filled by SHA512_Update()
DWORD64 Block[16];
// 128-bit count of bytes hashed, low qword first
DWORD64 Count[2];
} SHA512_CTX;
// Routines implemented in the assembler module sha-512.obj.

// Routine called by SHA512_Update() whenever the 128-byte block buffer is full;
// also called directly by the timing loop of the test program.
VOID CDECL SHA512_Core(SHA512_CTX *Context);

// Finishes the computation and stores the 64-byte digest in Digest.
VOID CDECL SHA512_Final(SHA512_CTX *Context, BYTE Digest[64]);

// Prepares Context for a new computation.
VOID CDECL SHA512_Init(SHA512_CTX *Context);

// Feeds Size bytes starting at Data into the computation.
// Size is declared as 64-bit quantity: the assembler routine reads the whole
// register R8 and sha-512.h declares 'unsigned long long size'; with a 32-bit
// parameter the upper half of R8 is not guaranteed to be zero on x64.
VOID CDECL SHA512_Update(SHA512_CTX *Context, LPCVOID Data, DWORD64 Size);
// __edivmodu(N, D): divides the 64-bit unsigned dividend N by the 32-bit
// unsigned divisor D and yields quotient and remainder as two DWORDs, in
// this order; used to feed the two "%lu" fields of the timing output.
#ifndef _M_IX86
// Not 32-bit x86: the macro expands to TWO comma-separated expressions, so it
// must only be used where an argument list is expected; both results are
// truncated to 32 bits, and N and D are evaluated twice.
#define __edivmodu(N, D) (DWORD) ((N) / (D)), (DWORD) ((N) % (D))
#else
// 32-bit x86: loads the dividend into EDX:EAX and executes a single DIV,
// which leaves the quotient in EAX and the remainder in EDX.
// NOTE(review): there is no return statement; the 8-byte structure is
//               presumably taken from EDX:EAX by the compiler -- confirm.
// NOTE(review): DIV raises a divide error if the quotient does not fit in
//               32 bits; callers must keep the dividend small enough.
__forceinline // companion for __emulu()
struct
{
DWORD ulQuotient, ulRemainder;
} CDECL __edivmodu(DWORD64 ullDividend, DWORD ulDivisor)
{
__asm mov eax, dword ptr ullDividend
__asm mov edx, dword ptr ullDividend+4
__asm div ulDivisor
}
#endif // _M_IX86
// Formats a message with wvsprintf() into a local buffer of 1024 wide
// characters and writes it with WriteConsole() to the console hConsole.
// Returns TRUE only if the formatted text is not empty and all of its
// characters were written, FALSE otherwise.
__declspec(safebuffers)
BOOL CDECL PrintConsole(HANDLE hConsole, [SA_FormatString(Style="printf")] LPCWSTR lpFormat, ...)
{
    WCHAR   szBuffer[1024];
    DWORD   dwLength;       // number of characters produced by wvsprintf()
    DWORD   dwWritten;      // number of characters accepted by WriteConsole()
    va_list vaArguments;

    va_start(vaArguments, lpFormat);
    dwLength = wvsprintf(szBuffer, lpFormat, vaArguments);
    va_end(vaArguments);

    // write only when there is something to write; a short write is a failure
    if ((dwLength != 0)
     && WriteConsole(hConsole, szBuffer, dwLength, &dwWritten, NULL))
        return dwWritten == dwLength;

    return FALSE;
}
// Entry point of the test and benchmark program (no C runtime is linked).
//
// 1. Reads the processor brand string and checks for the SHA512 instructions
//    (CPUID leaf 7, sub-leaf 1, EAX bit 0); without them only a message is
//    printed.
// 2. Hashes a series of test messages; for each one it prints the expected
//    digest (lower case, first and last line) around the computed state
//    (upper case, two middle lines) for visual comparison.
// 3. Runs SHA512_Core() over 1,000,000,000 bytes (7,812,500 blocks of 128
//    bytes) and prints clock cycles per byte (if CYCLES is defined) or
//    nano-seconds of user time per byte.
//
// The process exits with the last Win32 error code recorded, else ERROR_SUCCESS.
//
// NOTE(review): the state words are printed in the order 7, 6, 3, 2, 5, 4, 1, 0;
//               presumably this is the order in which the assembler routines
//               keep the state -- confirm against SHA512_Final().
__declspec(noreturn)
VOID CDECL wmainCRTStartup(VOID)
{
    SHA512_CTX Context;
    BYTE    cbDigest[64], cbMillion[1000000];   // SHA512_Final() stores 64 bytes
    DWORD   dwCPUID[16];    // [0..3] = scratch, [4..15] = 48-byte brand string
    DWORD   dwError = ERROR_SUCCESS;
    DWORD   dwThread = 1000000000 / 128;        // number of 128-byte blocks to time
    DWORD64 qwThread[3];    // [0] = start, [1] = end, [2] = scratch for unused times
    HANDLE  hThread = GetCurrentThread();
    HANDLE  hConsole = GetStdHandle(STD_ERROR_HANDLE);

    if (hConsole == INVALID_HANDLE_VALUE)
        dwError = GetLastError();
    else
    {
        __cpuid(dwCPUID, 0x80000000);

        if (*dwCPUID >= 0x80000004)
        {
            __cpuid(dwCPUID + 4, 0x80000002);
            __cpuid(dwCPUID + 8, 0x80000003);
            __cpuid(dwCPUID + 12, 0x80000004);
        }
        else    // store the substitute where the brand string is read from below
            __movsb((BYTE *) (dwCPUID + 4), (BYTE const *) "unidentified processor", sizeof("unidentified processor"));

        __cpuidex(dwCPUID, 7, 1);

        if ((*dwCPUID & 1) == 0)    // EAX bit 0 clear: no SHA512 instructions
            PrintConsole(hConsole, L"SHA512 instructions not supported on %.48hs!\n", dwCPUID + 4);
        else
        {
            if (SetThreadIdealProcessor(hThread, 0) == -1)
                PrintConsole(hConsole,
                             L"SetThreadIdealProcessor() returned error %lu\n",
                             dwError = GetLastError());

            if (!SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST))
                PrintConsole(hConsole,
                             L"SetThreadPriority() returned error %lu\n",
                             dwError = GetLastError());

            PrintConsole(hConsole, L"\nTesting SHA-512 implementation...\n");

            // empty message
            SHA512_Init(&Context);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"\"\n"
                         L"\tcf83e1357eefb8bd f1542850d66d8007 d620e4050b5715dc 83f4a921d36ce9ce\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t47d0d13c5d85f2b0 ff8318d2877eec2f 63b931bd47417a81 a538327af927da3e\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 3 bytes
            SHA512_Init(&Context);
            SHA512_Update(&Context, "abc", 3);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abc\"\n"
                         L"\tddaf35a193617aba cc417349ae204131 12e6fa4e89a97ea2 0a9eeee64b55d39a\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t2192992a274fc1a8 36ba3c23a3feebbd 454d4423643ce80e 2a9ac94fa54ca49f\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 112 bytes
            SHA512_Init(&Context);
            SHA512_Update(&Context, "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu",
                          sizeof("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu") - 1);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu\"\n"
                         L"\t8e959b75dae313da 8cf4f72814fc143f 8f7779c6eb9f7fa1 7299aeadb6889018\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t501d289e4900f7e4 331b99dec4b5433a c7d329eeb6dd2654 5e96e55b874be909\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // runs of zero bytes with various lengths
            __stosb(cbMillion, 0, 1000);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 111);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×111\n"
                         L"\t77ddd3a542e530fd 047b8977c657ba6c e72f1492e360b2b2 212cd264e75ec038\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t82e4ff0525517ab4 207d14c70c2259ba 88d4d335ee0e7e20 543d22102ab1788c\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 112);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×112\n"
                         L"\t2be2e788c8a8adea a9c89a7f78904cac ea6e39297d75e057 3a73c756234534d6\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t627ab4156b48a665 7b29ab8beb733340 40ad39ead81446bb 09c70704ec707952\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 113);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×113\n"
                         L"\t0e67910bcf0f9ccd e5464c63b9c850a1 2a759227d16b040d 98986d54253f9f34\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t322318e56b8feb86 c5fb2270ed87f312 52f7f68493ee7597 43909bd75e4bb544\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 122);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×122\n"
                         L"\t4f3f095d015be4a7 a7cc0b8c04da4aa0 9e74351e3a97651f 744c23716ebd9b3e\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t822e5077a01baa5c c0ed45b9249e88ab 343d4333539df21e d229da6f4a514e0f\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 1000);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×1000\n"
                         L"\tca3dff61bb23477a a6087b27508264a6 f9126ee3a004f53c b8db942ed345f2f2\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\td229b4b59c859220 a1cf1913f34248e3 803bab650e849a3d 9a709edc09ae4a76\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 1000 × 'A'
            __stosb(cbMillion, 'A', 1000);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 1000);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"A…A\"\n"
                         L"\t329c52ac62d1fe73 1151f2b895a00475 445ef74f50b979c6 f7bb7cae349328c1\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\td4cb4f7261a0ab43 f936a24b000651d4 a824fcdd577f211a ef8f806b16afe8af\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 1005 × 'U'
            __stosb(cbMillion, 'U', 1005);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 1005);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"U…U\"\n"
                         L"\t59f5e54fe299c6a8 764c6b199e44924a 37f59e2b56c3ebad 939b7289210dc8e4\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\tc21b9720165b0f4d 4374c90f1bf4fb4a 5ace17a116179801 5052893a48c3d161\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 1000000 × 'a'
            __stosb(cbMillion, 'a', 1000000);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 1000000);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\"a…a\"\n"
                         L"\te718483d0ce76964 4e2e42c7bc15b463 8e1f98b13b204428 5632a803afa973eb\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\tde0ff244877ea60a 4cb0432ce577c31b eb009c5c2c49aa2e 4eadb217ad8cc09b\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            // 1000000 zero bytes
            __stosb(cbMillion, 0, 1000000);

            SHA512_Init(&Context);
            SHA512_Update(&Context, cbMillion, 1000000);
            SHA512_Final(&Context, cbDigest);
            PrintConsole(hConsole,
                         L"\'\\0\'×1000000\n"
                         L"\tce044bc9fd43269d 5bbc946cbebc3bb7 11341115cc4abdf2 edbc3ff2c57ad4b1\n"
                         L"\t%016I64X %016I64X %016I64X %016I64X\n\t%016I64X %016I64X %016I64X %016I64X\n"
                         L"\t5deb699bda257fea 5aef9c6e55fcf4cf 9dc25a8c3ce25f2e fe90908379bff7ed\n",
                         Context.State[7], Context.State[6], Context.State[3], Context.State[2],
                         Context.State[5], Context.State[4], Context.State[1], Context.State[0]);

            PrintConsole(hConsole, L"\nTiming SHA-512 on %.48hs:\n", dwCPUID + 4);
#ifdef CYCLES
            if (!QueryThreadCycleTime(hThread, qwThread))
                PrintConsole(hConsole,
                             L"QueryThreadCycleTime() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA512_Core(&Context);
                while (--dwThread);

                if (!QueryThreadCycleTime(hThread, qwThread + 1))
                    PrintConsole(hConsole,
                                 L"QueryThreadCycleTime() returned error %lu\n",
                                 dwError = GetLastError());
                else    // cycles / 10^9 bytes, printed as fixed-point number
                    PrintConsole(hConsole,
                                 L"%lu.%09lu clock cycles per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 1000000000));
            }
#else
            // only the user time is of interest: creation, exit and kernel time
            // go to the scratch slot qwThread[2], so they can't clobber it
            if (!GetThreadTimes(hThread, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread))
                PrintConsole(hConsole,
                             L"GetThreadTimes() returned error %lu\n",
                             dwError = GetLastError());
            else
            {
                do
                    SHA512_Core(&Context);
                while (--dwThread);

                if (!GetThreadTimes(hThread, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 2, (LPFILETIME) qwThread + 1))
                    PrintConsole(hConsole,
                                 L"GetThreadTimes() returned error %lu\n",
                                 dwError = GetLastError());
                else    // 100 ns units / 10^9 bytes = units / 10^7 nano-seconds per byte
                    PrintConsole(hConsole,
                                 L"%lu.%07lu nano-seconds per byte\n",
                                 __edivmodu(qwThread[1] - qwThread[0], 10000000));
            }
#endif // CYCLES
        }
    }

    ExitProcess(dwError);
}
Save the
ANSI C
source presented above as sha-512.c
next to the object
file sha-512.obj
assembled before, then run the
following 4 command lines to build the 64-bit console application
sha-512.exe
and execute it:
SET CL=/GAFS- /Gs1049600 /Oxy /W4 /Zl
SET LINK=/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
CL.EXE /DCYCLES /Fosha-512.tmp sha-512.c sha-512.obj kernel32.lib user32.lib
.\sha-512.exe
For details and reference see the MSDN articles Compiler Options and Linker Options.
Note: if necessary, see the MSDN article Use the Microsoft C++ toolset from the command line for an introduction.
Note: the command lines can be copied and pasted as block into a Command Processor window.
Microsoft (R) C/C++ Optimizing Compiler Version 16.00.40219.01 for x64
Copyright (C) Microsoft Corporation. All rights reserved.
sha-512.c
Microsoft (R) Incremental Linker Version 10.00.40219.386
Copyright (C) Microsoft Corporation. All rights reserved.
/ENTRY:wmainCRTStartup /NODEFAULTLIB /STACK:2097152,1048576 /SUBSYSTEM:CONSOLE
/out:sha-512.exe
sha-512.tmp
sha-512.obj
kernel32.lib
user32.lib
SHA512 instructions not supported on AMD Ryzen 7 5700X 8-Core Processor !
With our implementation, a single thread of an Intel® Core™ i7 processor 2600 can compute SHA-512 of a large data buffer at the rate of ~8.59 cycles/byte.
Use the X.509 certificate to send S/MIME encrypted mail.
Note: email in weird format and without a proper sender name is likely to be discarded!
I dislike
HTML (and even
weirder formats too) in email, I prefer to receive plain text.
I also expect to see your full (real) name as sender, not your
nickname.
I abhor top posts and expect inline quotes in replies.
as is, without any warranty, neither express nor implied.
cookies in the web browser.
The web service is operated and provided by
Telekom Deutschland GmbH. The web service provider stores a session cookie
in the web
browser and records every visit of this web site with the following
data in an access log on their server(s):