Coverage Report

Created: 2024-10-29 12:15

/root/bitcoin/src/crypto/sha256_sse4.cpp
Line | Count | Source   (a blank count marks a non-executable line)
1
// Copyright (c) 2017-2022 The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// This is a translation to GCC extended asm syntax from YASM code by Intel
6
// (available at the bottom of this file).
7
8
#include <cstdlib>
9
#include <stdint.h>
10
11
#if defined(__x86_64__) || defined(__amd64__)
12
13
namespace sha256_sse4
14
{
15
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
16
#if defined(__clang__) && !defined(__OPTIMIZE__)
17
  /*
18
  clang is unable to compile this with -O0 and -fsanitize=address.
19
  See upstream bug: https://github.com/llvm/llvm-project/issues/92182
20
  */
21
  __attribute__((no_sanitize("address")))
22
#endif
23
0
{
24
0
    static const uint32_t K256 alignas(16) [] = {
25
0
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
26
0
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
27
0
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
28
0
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
29
0
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
30
0
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
31
0
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
32
0
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
33
0
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
34
0
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
35
0
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
36
0
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
37
0
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
38
0
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
39
0
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
40
0
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
41
0
    };
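    // For reference (not part of the original file): these are the standard FIPS 180-4
    // round constants, K[t] = floor(2^32 * frac(cbrt(p_t))) over the first 64 primes p_t.
    // For example, frac(cbrt(2)) = 0.2599210498..., and 0.2599210498... * 2^32 truncates
    // to 0x428a2f98, the first entry above.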
42
0
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
43
0
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
44
0
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
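    // For reference (not part of the original file): FLIP_MASK is a pshufb control that
    // byte-reverses each 32-bit lane (SHA-256 reads the message big-endian), while
    // SHUF_00BA / SHUF_DC00 gather two dwords into the low or high half of a register
    // and zero the rest (pshufb index bytes with the high bit set, such as 0xff, yield 0).
    // A scalar sketch of the per-lane effect of FLIP_MASK, assuming a little-endian host:
    //
    //   #include <cstdint>
    //   // Load four big-endian message bytes as one host-order word -- the same
    //   // transformation pshufb with FLIP_MASK applies to every lane of a block.
    //   inline uint32_t be32_load(const unsigned char* p)
    //   {
    //       return (uint32_t{p[0]} << 24) | (uint32_t{p[1]} << 16) |
    //              (uint32_t{p[2]} << 8)  |  uint32_t{p[3]};
    //   }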
45
0
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
46
0
    uint64_t tbl;
47
0
    uint64_t inp_end, inp;
48
0
    uint32_t xfer alignas(16) [4];
49
50
0
    __asm__ __volatile__(
51
0
        "shl    $0x6,%2;"
52
0
        "je     Ldone_hash_%=;"
53
0
        "add    %1,%2;"
54
0
        "mov    %2,%14;"
55
0
        "mov    (%0),%3;"
56
0
        "mov    0x4(%0),%4;"
57
0
        "mov    0x8(%0),%5;"
58
0
        "mov    0xc(%0),%6;"
59
0
        "mov    0x10(%0),%k2;"
60
0
        "mov    0x14(%0),%7;"
61
0
        "mov    0x18(%0),%8;"
62
0
        "mov    0x1c(%0),%9;"
63
0
        "movdqa %18,%%xmm12;"
64
0
        "movdqa %19,%%xmm10;"
65
0
        "movdqa %20,%%xmm11;"
66
67
0
        "Lloop0_%=:"
68
0
        "lea    %17,%13;"
69
0
        "movdqu (%1),%%xmm4;"
70
0
        "pshufb %%xmm12,%%xmm4;"
71
0
        "movdqu 0x10(%1),%%xmm5;"
72
0
        "pshufb %%xmm12,%%xmm5;"
73
0
        "movdqu 0x20(%1),%%xmm6;"
74
0
        "pshufb %%xmm12,%%xmm6;"
75
0
        "movdqu 0x30(%1),%%xmm7;"
76
0
        "pshufb %%xmm12,%%xmm7;"
77
0
        "mov    %1,%15;"
78
0
        "mov    $3,%1;"
79
80
0
        "Lloop1_%=:"
81
0
        "movdqa 0x0(%13),%%xmm9;"
82
0
        "paddd  %%xmm4,%%xmm9;"
83
0
        "movdqa %%xmm9,%16;"
84
0
        "movdqa %%xmm7,%%xmm0;"
85
0
        "mov    %k2,%10;"
86
0
        "ror    $0xe,%10;"
87
0
        "mov    %3,%11;"
88
0
        "palignr $0x4,%%xmm6,%%xmm0;"
89
0
        "ror    $0x9,%11;"
90
0
        "xor    %k2,%10;"
91
0
        "mov    %7,%12;"
92
0
        "ror    $0x5,%10;"
93
0
        "movdqa %%xmm5,%%xmm1;"
94
0
        "xor    %3,%11;"
95
0
        "xor    %8,%12;"
96
0
        "paddd  %%xmm4,%%xmm0;"
97
0
        "xor    %k2,%10;"
98
0
        "and    %k2,%12;"
99
0
        "ror    $0xb,%11;"
100
0
        "palignr $0x4,%%xmm4,%%xmm1;"
101
0
        "xor    %3,%11;"
102
0
        "ror    $0x6,%10;"
103
0
        "xor    %8,%12;"
104
0
        "movdqa %%xmm1,%%xmm2;"
105
0
        "ror    $0x2,%11;"
106
0
        "add    %10,%12;"
107
0
        "add    %16,%12;"
108
0
        "movdqa %%xmm1,%%xmm3;"
109
0
        "mov    %3,%10;"
110
0
        "add    %12,%9;"
111
0
        "mov    %3,%12;"
112
0
        "pslld  $0x19,%%xmm1;"
113
0
        "or     %5,%10;"
114
0
        "add    %9,%6;"
115
0
        "and    %5,%12;"
116
0
        "psrld  $0x7,%%xmm2;"
117
0
        "and    %4,%10;"
118
0
        "add    %11,%9;"
119
0
        "por    %%xmm2,%%xmm1;"
120
0
        "or     %12,%10;"
121
0
        "add    %10,%9;"
122
0
        "movdqa %%xmm3,%%xmm2;"
123
0
        "mov    %6,%10;"
124
0
        "mov    %9,%11;"
125
0
        "movdqa %%xmm3,%%xmm8;"
126
0
        "ror    $0xe,%10;"
127
0
        "xor    %6,%10;"
128
0
        "mov    %k2,%12;"
129
0
        "ror    $0x9,%11;"
130
0
        "pslld  $0xe,%%xmm3;"
131
0
        "xor    %9,%11;"
132
0
        "ror    $0x5,%10;"
133
0
        "xor    %7,%12;"
134
0
        "psrld  $0x12,%%xmm2;"
135
0
        "ror    $0xb,%11;"
136
0
        "xor    %6,%10;"
137
0
        "and    %6,%12;"
138
0
        "ror    $0x6,%10;"
139
0
        "pxor   %%xmm3,%%xmm1;"
140
0
        "xor    %9,%11;"
141
0
        "xor    %7,%12;"
142
0
        "psrld  $0x3,%%xmm8;"
143
0
        "add    %10,%12;"
144
0
        "add    4+%16,%12;"
145
0
        "ror    $0x2,%11;"
146
0
        "pxor   %%xmm2,%%xmm1;"
147
0
        "mov    %9,%10;"
148
0
        "add    %12,%8;"
149
0
        "mov    %9,%12;"
150
0
        "pxor   %%xmm8,%%xmm1;"
151
0
        "or     %4,%10;"
152
0
        "add    %8,%5;"
153
0
        "and    %4,%12;"
154
0
        "pshufd $0xfa,%%xmm7,%%xmm2;"
155
0
        "and    %3,%10;"
156
0
        "add    %11,%8;"
157
0
        "paddd  %%xmm1,%%xmm0;"
158
0
        "or     %12,%10;"
159
0
        "add    %10,%8;"
160
0
        "movdqa %%xmm2,%%xmm3;"
161
0
        "mov    %5,%10;"
162
0
        "mov    %8,%11;"
163
0
        "ror    $0xe,%10;"
164
0
        "movdqa %%xmm2,%%xmm8;"
165
0
        "xor    %5,%10;"
166
0
        "ror    $0x9,%11;"
167
0
        "mov    %6,%12;"
168
0
        "xor    %8,%11;"
169
0
        "ror    $0x5,%10;"
170
0
        "psrlq  $0x11,%%xmm2;"
171
0
        "xor    %k2,%12;"
172
0
        "psrlq  $0x13,%%xmm3;"
173
0
        "xor    %5,%10;"
174
0
        "and    %5,%12;"
175
0
        "psrld  $0xa,%%xmm8;"
176
0
        "ror    $0xb,%11;"
177
0
        "xor    %8,%11;"
178
0
        "xor    %k2,%12;"
179
0
        "ror    $0x6,%10;"
180
0
        "pxor   %%xmm3,%%xmm2;"
181
0
        "add    %10,%12;"
182
0
        "ror    $0x2,%11;"
183
0
        "add    8+%16,%12;"
184
0
        "pxor   %%xmm2,%%xmm8;"
185
0
        "mov    %8,%10;"
186
0
        "add    %12,%7;"
187
0
        "mov    %8,%12;"
188
0
        "pshufb %%xmm10,%%xmm8;"
189
0
        "or     %3,%10;"
190
0
        "add    %7,%4;"
191
0
        "and    %3,%12;"
192
0
        "paddd  %%xmm8,%%xmm0;"
193
0
        "and    %9,%10;"
194
0
        "add    %11,%7;"
195
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
196
0
        "or     %12,%10;"
197
0
        "add    %10,%7;"
198
0
        "movdqa %%xmm2,%%xmm3;"
199
0
        "mov    %4,%10;"
200
0
        "ror    $0xe,%10;"
201
0
        "mov    %7,%11;"
202
0
        "movdqa %%xmm2,%%xmm4;"
203
0
        "ror    $0x9,%11;"
204
0
        "xor    %4,%10;"
205
0
        "mov    %5,%12;"
206
0
        "ror    $0x5,%10;"
207
0
        "psrlq  $0x11,%%xmm2;"
208
0
        "xor    %7,%11;"
209
0
        "xor    %6,%12;"
210
0
        "psrlq  $0x13,%%xmm3;"
211
0
        "xor    %4,%10;"
212
0
        "and    %4,%12;"
213
0
        "ror    $0xb,%11;"
214
0
        "psrld  $0xa,%%xmm4;"
215
0
        "xor    %7,%11;"
216
0
        "ror    $0x6,%10;"
217
0
        "xor    %6,%12;"
218
0
        "pxor   %%xmm3,%%xmm2;"
219
0
        "ror    $0x2,%11;"
220
0
        "add    %10,%12;"
221
0
        "add    12+%16,%12;"
222
0
        "pxor   %%xmm2,%%xmm4;"
223
0
        "mov    %7,%10;"
224
0
        "add    %12,%k2;"
225
0
        "mov    %7,%12;"
226
0
        "pshufb %%xmm11,%%xmm4;"
227
0
        "or     %9,%10;"
228
0
        "add    %k2,%3;"
229
0
        "and    %9,%12;"
230
0
        "paddd  %%xmm0,%%xmm4;"
231
0
        "and    %8,%10;"
232
0
        "add    %11,%k2;"
233
0
        "or     %12,%10;"
234
0
        "add    %10,%k2;"
235
0
        "movdqa 0x10(%13),%%xmm9;"
236
0
        "paddd  %%xmm5,%%xmm9;"
237
0
        "movdqa %%xmm9,%16;"
238
0
        "movdqa %%xmm4,%%xmm0;"
239
0
        "mov    %3,%10;"
240
0
        "ror    $0xe,%10;"
241
0
        "mov    %k2,%11;"
242
0
        "palignr $0x4,%%xmm7,%%xmm0;"
243
0
        "ror    $0x9,%11;"
244
0
        "xor    %3,%10;"
245
0
        "mov    %4,%12;"
246
0
        "ror    $0x5,%10;"
247
0
        "movdqa %%xmm6,%%xmm1;"
248
0
        "xor    %k2,%11;"
249
0
        "xor    %5,%12;"
250
0
        "paddd  %%xmm5,%%xmm0;"
251
0
        "xor    %3,%10;"
252
0
        "and    %3,%12;"
253
0
        "ror    $0xb,%11;"
254
0
        "palignr $0x4,%%xmm5,%%xmm1;"
255
0
        "xor    %k2,%11;"
256
0
        "ror    $0x6,%10;"
257
0
        "xor    %5,%12;"
258
0
        "movdqa %%xmm1,%%xmm2;"
259
0
        "ror    $0x2,%11;"
260
0
        "add    %10,%12;"
261
0
        "add    %16,%12;"
262
0
        "movdqa %%xmm1,%%xmm3;"
263
0
        "mov    %k2,%10;"
264
0
        "add    %12,%6;"
265
0
        "mov    %k2,%12;"
266
0
        "pslld  $0x19,%%xmm1;"
267
0
        "or     %8,%10;"
268
0
        "add    %6,%9;"
269
0
        "and    %8,%12;"
270
0
        "psrld  $0x7,%%xmm2;"
271
0
        "and    %7,%10;"
272
0
        "add    %11,%6;"
273
0
        "por    %%xmm2,%%xmm1;"
274
0
        "or     %12,%10;"
275
0
        "add    %10,%6;"
276
0
        "movdqa %%xmm3,%%xmm2;"
277
0
        "mov    %9,%10;"
278
0
        "mov    %6,%11;"
279
0
        "movdqa %%xmm3,%%xmm8;"
280
0
        "ror    $0xe,%10;"
281
0
        "xor    %9,%10;"
282
0
        "mov    %3,%12;"
283
0
        "ror    $0x9,%11;"
284
0
        "pslld  $0xe,%%xmm3;"
285
0
        "xor    %6,%11;"
286
0
        "ror    $0x5,%10;"
287
0
        "xor    %4,%12;"
288
0
        "psrld  $0x12,%%xmm2;"
289
0
        "ror    $0xb,%11;"
290
0
        "xor    %9,%10;"
291
0
        "and    %9,%12;"
292
0
        "ror    $0x6,%10;"
293
0
        "pxor   %%xmm3,%%xmm1;"
294
0
        "xor    %6,%11;"
295
0
        "xor    %4,%12;"
296
0
        "psrld  $0x3,%%xmm8;"
297
0
        "add    %10,%12;"
298
0
        "add    4+%16,%12;"
299
0
        "ror    $0x2,%11;"
300
0
        "pxor   %%xmm2,%%xmm1;"
301
0
        "mov    %6,%10;"
302
0
        "add    %12,%5;"
303
0
        "mov    %6,%12;"
304
0
        "pxor   %%xmm8,%%xmm1;"
305
0
        "or     %7,%10;"
306
0
        "add    %5,%8;"
307
0
        "and    %7,%12;"
308
0
        "pshufd $0xfa,%%xmm4,%%xmm2;"
309
0
        "and    %k2,%10;"
310
0
        "add    %11,%5;"
311
0
        "paddd  %%xmm1,%%xmm0;"
312
0
        "or     %12,%10;"
313
0
        "add    %10,%5;"
314
0
        "movdqa %%xmm2,%%xmm3;"
315
0
        "mov    %8,%10;"
316
0
        "mov    %5,%11;"
317
0
        "ror    $0xe,%10;"
318
0
        "movdqa %%xmm2,%%xmm8;"
319
0
        "xor    %8,%10;"
320
0
        "ror    $0x9,%11;"
321
0
        "mov    %9,%12;"
322
0
        "xor    %5,%11;"
323
0
        "ror    $0x5,%10;"
324
0
        "psrlq  $0x11,%%xmm2;"
325
0
        "xor    %3,%12;"
326
0
        "psrlq  $0x13,%%xmm3;"
327
0
        "xor    %8,%10;"
328
0
        "and    %8,%12;"
329
0
        "psrld  $0xa,%%xmm8;"
330
0
        "ror    $0xb,%11;"
331
0
        "xor    %5,%11;"
332
0
        "xor    %3,%12;"
333
0
        "ror    $0x6,%10;"
334
0
        "pxor   %%xmm3,%%xmm2;"
335
0
        "add    %10,%12;"
336
0
        "ror    $0x2,%11;"
337
0
        "add    8+%16,%12;"
338
0
        "pxor   %%xmm2,%%xmm8;"
339
0
        "mov    %5,%10;"
340
0
        "add    %12,%4;"
341
0
        "mov    %5,%12;"
342
0
        "pshufb %%xmm10,%%xmm8;"
343
0
        "or     %k2,%10;"
344
0
        "add    %4,%7;"
345
0
        "and    %k2,%12;"
346
0
        "paddd  %%xmm8,%%xmm0;"
347
0
        "and    %6,%10;"
348
0
        "add    %11,%4;"
349
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
350
0
        "or     %12,%10;"
351
0
        "add    %10,%4;"
352
0
        "movdqa %%xmm2,%%xmm3;"
353
0
        "mov    %7,%10;"
354
0
        "ror    $0xe,%10;"
355
0
        "mov    %4,%11;"
356
0
        "movdqa %%xmm2,%%xmm5;"
357
0
        "ror    $0x9,%11;"
358
0
        "xor    %7,%10;"
359
0
        "mov    %8,%12;"
360
0
        "ror    $0x5,%10;"
361
0
        "psrlq  $0x11,%%xmm2;"
362
0
        "xor    %4,%11;"
363
0
        "xor    %9,%12;"
364
0
        "psrlq  $0x13,%%xmm3;"
365
0
        "xor    %7,%10;"
366
0
        "and    %7,%12;"
367
0
        "ror    $0xb,%11;"
368
0
        "psrld  $0xa,%%xmm5;"
369
0
        "xor    %4,%11;"
370
0
        "ror    $0x6,%10;"
371
0
        "xor    %9,%12;"
372
0
        "pxor   %%xmm3,%%xmm2;"
373
0
        "ror    $0x2,%11;"
374
0
        "add    %10,%12;"
375
0
        "add    12+%16,%12;"
376
0
        "pxor   %%xmm2,%%xmm5;"
377
0
        "mov    %4,%10;"
378
0
        "add    %12,%3;"
379
0
        "mov    %4,%12;"
380
0
        "pshufb %%xmm11,%%xmm5;"
381
0
        "or     %6,%10;"
382
0
        "add    %3,%k2;"
383
0
        "and    %6,%12;"
384
0
        "paddd  %%xmm0,%%xmm5;"
385
0
        "and    %5,%10;"
386
0
        "add    %11,%3;"
387
0
        "or     %12,%10;"
388
0
        "add    %10,%3;"
389
0
        "movdqa 0x20(%13),%%xmm9;"
390
0
        "paddd  %%xmm6,%%xmm9;"
391
0
        "movdqa %%xmm9,%16;"
392
0
        "movdqa %%xmm5,%%xmm0;"
393
0
        "mov    %k2,%10;"
394
0
        "ror    $0xe,%10;"
395
0
        "mov    %3,%11;"
396
0
        "palignr $0x4,%%xmm4,%%xmm0;"
397
0
        "ror    $0x9,%11;"
398
0
        "xor    %k2,%10;"
399
0
        "mov    %7,%12;"
400
0
        "ror    $0x5,%10;"
401
0
        "movdqa %%xmm7,%%xmm1;"
402
0
        "xor    %3,%11;"
403
0
        "xor    %8,%12;"
404
0
        "paddd  %%xmm6,%%xmm0;"
405
0
        "xor    %k2,%10;"
406
0
        "and    %k2,%12;"
407
0
        "ror    $0xb,%11;"
408
0
        "palignr $0x4,%%xmm6,%%xmm1;"
409
0
        "xor    %3,%11;"
410
0
        "ror    $0x6,%10;"
411
0
        "xor    %8,%12;"
412
0
        "movdqa %%xmm1,%%xmm2;"
413
0
        "ror    $0x2,%11;"
414
0
        "add    %10,%12;"
415
0
        "add    %16,%12;"
416
0
        "movdqa %%xmm1,%%xmm3;"
417
0
        "mov    %3,%10;"
418
0
        "add    %12,%9;"
419
0
        "mov    %3,%12;"
420
0
        "pslld  $0x19,%%xmm1;"
421
0
        "or     %5,%10;"
422
0
        "add    %9,%6;"
423
0
        "and    %5,%12;"
424
0
        "psrld  $0x7,%%xmm2;"
425
0
        "and    %4,%10;"
426
0
        "add    %11,%9;"
427
0
        "por    %%xmm2,%%xmm1;"
428
0
        "or     %12,%10;"
429
0
        "add    %10,%9;"
430
0
        "movdqa %%xmm3,%%xmm2;"
431
0
        "mov    %6,%10;"
432
0
        "mov    %9,%11;"
433
0
        "movdqa %%xmm3,%%xmm8;"
434
0
        "ror    $0xe,%10;"
435
0
        "xor    %6,%10;"
436
0
        "mov    %k2,%12;"
437
0
        "ror    $0x9,%11;"
438
0
        "pslld  $0xe,%%xmm3;"
439
0
        "xor    %9,%11;"
440
0
        "ror    $0x5,%10;"
441
0
        "xor    %7,%12;"
442
0
        "psrld  $0x12,%%xmm2;"
443
0
        "ror    $0xb,%11;"
444
0
        "xor    %6,%10;"
445
0
        "and    %6,%12;"
446
0
        "ror    $0x6,%10;"
447
0
        "pxor   %%xmm3,%%xmm1;"
448
0
        "xor    %9,%11;"
449
0
        "xor    %7,%12;"
450
0
        "psrld  $0x3,%%xmm8;"
451
0
        "add    %10,%12;"
452
0
        "add    4+%16,%12;"
453
0
        "ror    $0x2,%11;"
454
0
        "pxor   %%xmm2,%%xmm1;"
455
0
        "mov    %9,%10;"
456
0
        "add    %12,%8;"
457
0
        "mov    %9,%12;"
458
0
        "pxor   %%xmm8,%%xmm1;"
459
0
        "or     %4,%10;"
460
0
        "add    %8,%5;"
461
0
        "and    %4,%12;"
462
0
        "pshufd $0xfa,%%xmm5,%%xmm2;"
463
0
        "and    %3,%10;"
464
0
        "add    %11,%8;"
465
0
        "paddd  %%xmm1,%%xmm0;"
466
0
        "or     %12,%10;"
467
0
        "add    %10,%8;"
468
0
        "movdqa %%xmm2,%%xmm3;"
469
0
        "mov    %5,%10;"
470
0
        "mov    %8,%11;"
471
0
        "ror    $0xe,%10;"
472
0
        "movdqa %%xmm2,%%xmm8;"
473
0
        "xor    %5,%10;"
474
0
        "ror    $0x9,%11;"
475
0
        "mov    %6,%12;"
476
0
        "xor    %8,%11;"
477
0
        "ror    $0x5,%10;"
478
0
        "psrlq  $0x11,%%xmm2;"
479
0
        "xor    %k2,%12;"
480
0
        "psrlq  $0x13,%%xmm3;"
481
0
        "xor    %5,%10;"
482
0
        "and    %5,%12;"
483
0
        "psrld  $0xa,%%xmm8;"
484
0
        "ror    $0xb,%11;"
485
0
        "xor    %8,%11;"
486
0
        "xor    %k2,%12;"
487
0
        "ror    $0x6,%10;"
488
0
        "pxor   %%xmm3,%%xmm2;"
489
0
        "add    %10,%12;"
490
0
        "ror    $0x2,%11;"
491
0
        "add    8+%16,%12;"
492
0
        "pxor   %%xmm2,%%xmm8;"
493
0
        "mov    %8,%10;"
494
0
        "add    %12,%7;"
495
0
        "mov    %8,%12;"
496
0
        "pshufb %%xmm10,%%xmm8;"
497
0
        "or     %3,%10;"
498
0
        "add    %7,%4;"
499
0
        "and    %3,%12;"
500
0
        "paddd  %%xmm8,%%xmm0;"
501
0
        "and    %9,%10;"
502
0
        "add    %11,%7;"
503
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
504
0
        "or     %12,%10;"
505
0
        "add    %10,%7;"
506
0
        "movdqa %%xmm2,%%xmm3;"
507
0
        "mov    %4,%10;"
508
0
        "ror    $0xe,%10;"
509
0
        "mov    %7,%11;"
510
0
        "movdqa %%xmm2,%%xmm6;"
511
0
        "ror    $0x9,%11;"
512
0
        "xor    %4,%10;"
513
0
        "mov    %5,%12;"
514
0
        "ror    $0x5,%10;"
515
0
        "psrlq  $0x11,%%xmm2;"
516
0
        "xor    %7,%11;"
517
0
        "xor    %6,%12;"
518
0
        "psrlq  $0x13,%%xmm3;"
519
0
        "xor    %4,%10;"
520
0
        "and    %4,%12;"
521
0
        "ror    $0xb,%11;"
522
0
        "psrld  $0xa,%%xmm6;"
523
0
        "xor    %7,%11;"
524
0
        "ror    $0x6,%10;"
525
0
        "xor    %6,%12;"
526
0
        "pxor   %%xmm3,%%xmm2;"
527
0
        "ror    $0x2,%11;"
528
0
        "add    %10,%12;"
529
0
        "add    12+%16,%12;"
530
0
        "pxor   %%xmm2,%%xmm6;"
531
0
        "mov    %7,%10;"
532
0
        "add    %12,%k2;"
533
0
        "mov    %7,%12;"
534
0
        "pshufb %%xmm11,%%xmm6;"
535
0
        "or     %9,%10;"
536
0
        "add    %k2,%3;"
537
0
        "and    %9,%12;"
538
0
        "paddd  %%xmm0,%%xmm6;"
539
0
        "and    %8,%10;"
540
0
        "add    %11,%k2;"
541
0
        "or     %12,%10;"
542
0
        "add    %10,%k2;"
543
0
        "movdqa 0x30(%13),%%xmm9;"
544
0
        "paddd  %%xmm7,%%xmm9;"
545
0
        "movdqa %%xmm9,%16;"
546
0
        "add    $0x40,%13;"
547
0
        "movdqa %%xmm6,%%xmm0;"
548
0
        "mov    %3,%10;"
549
0
        "ror    $0xe,%10;"
550
0
        "mov    %k2,%11;"
551
0
        "palignr $0x4,%%xmm5,%%xmm0;"
552
0
        "ror    $0x9,%11;"
553
0
        "xor    %3,%10;"
554
0
        "mov    %4,%12;"
555
0
        "ror    $0x5,%10;"
556
0
        "movdqa %%xmm4,%%xmm1;"
557
0
        "xor    %k2,%11;"
558
0
        "xor    %5,%12;"
559
0
        "paddd  %%xmm7,%%xmm0;"
560
0
        "xor    %3,%10;"
561
0
        "and    %3,%12;"
562
0
        "ror    $0xb,%11;"
563
0
        "palignr $0x4,%%xmm7,%%xmm1;"
564
0
        "xor    %k2,%11;"
565
0
        "ror    $0x6,%10;"
566
0
        "xor    %5,%12;"
567
0
        "movdqa %%xmm1,%%xmm2;"
568
0
        "ror    $0x2,%11;"
569
0
        "add    %10,%12;"
570
0
        "add    %16,%12;"
571
0
        "movdqa %%xmm1,%%xmm3;"
572
0
        "mov    %k2,%10;"
573
0
        "add    %12,%6;"
574
0
        "mov    %k2,%12;"
575
0
        "pslld  $0x19,%%xmm1;"
576
0
        "or     %8,%10;"
577
0
        "add    %6,%9;"
578
0
        "and    %8,%12;"
579
0
        "psrld  $0x7,%%xmm2;"
580
0
        "and    %7,%10;"
581
0
        "add    %11,%6;"
582
0
        "por    %%xmm2,%%xmm1;"
583
0
        "or     %12,%10;"
584
0
        "add    %10,%6;"
585
0
        "movdqa %%xmm3,%%xmm2;"
586
0
        "mov    %9,%10;"
587
0
        "mov    %6,%11;"
588
0
        "movdqa %%xmm3,%%xmm8;"
589
0
        "ror    $0xe,%10;"
590
0
        "xor    %9,%10;"
591
0
        "mov    %3,%12;"
592
0
        "ror    $0x9,%11;"
593
0
        "pslld  $0xe,%%xmm3;"
594
0
        "xor    %6,%11;"
595
0
        "ror    $0x5,%10;"
596
0
        "xor    %4,%12;"
597
0
        "psrld  $0x12,%%xmm2;"
598
0
        "ror    $0xb,%11;"
599
0
        "xor    %9,%10;"
600
0
        "and    %9,%12;"
601
0
        "ror    $0x6,%10;"
602
0
        "pxor   %%xmm3,%%xmm1;"
603
0
        "xor    %6,%11;"
604
0
        "xor    %4,%12;"
605
0
        "psrld  $0x3,%%xmm8;"
606
0
        "add    %10,%12;"
607
0
        "add    4+%16,%12;"
608
0
        "ror    $0x2,%11;"
609
0
        "pxor   %%xmm2,%%xmm1;"
610
0
        "mov    %6,%10;"
611
0
        "add    %12,%5;"
612
0
        "mov    %6,%12;"
613
0
        "pxor   %%xmm8,%%xmm1;"
614
0
        "or     %7,%10;"
615
0
        "add    %5,%8;"
616
0
        "and    %7,%12;"
617
0
        "pshufd $0xfa,%%xmm6,%%xmm2;"
618
0
        "and    %k2,%10;"
619
0
        "add    %11,%5;"
620
0
        "paddd  %%xmm1,%%xmm0;"
621
0
        "or     %12,%10;"
622
0
        "add    %10,%5;"
623
0
        "movdqa %%xmm2,%%xmm3;"
624
0
        "mov    %8,%10;"
625
0
        "mov    %5,%11;"
626
0
        "ror    $0xe,%10;"
627
0
        "movdqa %%xmm2,%%xmm8;"
628
0
        "xor    %8,%10;"
629
0
        "ror    $0x9,%11;"
630
0
        "mov    %9,%12;"
631
0
        "xor    %5,%11;"
632
0
        "ror    $0x5,%10;"
633
0
        "psrlq  $0x11,%%xmm2;"
634
0
        "xor    %3,%12;"
635
0
        "psrlq  $0x13,%%xmm3;"
636
0
        "xor    %8,%10;"
637
0
        "and    %8,%12;"
638
0
        "psrld  $0xa,%%xmm8;"
639
0
        "ror    $0xb,%11;"
640
0
        "xor    %5,%11;"
641
0
        "xor    %3,%12;"
642
0
        "ror    $0x6,%10;"
643
0
        "pxor   %%xmm3,%%xmm2;"
644
0
        "add    %10,%12;"
645
0
        "ror    $0x2,%11;"
646
0
        "add    8+%16,%12;"
647
0
        "pxor   %%xmm2,%%xmm8;"
648
0
        "mov    %5,%10;"
649
0
        "add    %12,%4;"
650
0
        "mov    %5,%12;"
651
0
        "pshufb %%xmm10,%%xmm8;"
652
0
        "or     %k2,%10;"
653
0
        "add    %4,%7;"
654
0
        "and    %k2,%12;"
655
0
        "paddd  %%xmm8,%%xmm0;"
656
0
        "and    %6,%10;"
657
0
        "add    %11,%4;"
658
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
659
0
        "or     %12,%10;"
660
0
        "add    %10,%4;"
661
0
        "movdqa %%xmm2,%%xmm3;"
662
0
        "mov    %7,%10;"
663
0
        "ror    $0xe,%10;"
664
0
        "mov    %4,%11;"
665
0
        "movdqa %%xmm2,%%xmm7;"
666
0
        "ror    $0x9,%11;"
667
0
        "xor    %7,%10;"
668
0
        "mov    %8,%12;"
669
0
        "ror    $0x5,%10;"
670
0
        "psrlq  $0x11,%%xmm2;"
671
0
        "xor    %4,%11;"
672
0
        "xor    %9,%12;"
673
0
        "psrlq  $0x13,%%xmm3;"
674
0
        "xor    %7,%10;"
675
0
        "and    %7,%12;"
676
0
        "ror    $0xb,%11;"
677
0
        "psrld  $0xa,%%xmm7;"
678
0
        "xor    %4,%11;"
679
0
        "ror    $0x6,%10;"
680
0
        "xor    %9,%12;"
681
0
        "pxor   %%xmm3,%%xmm2;"
682
0
        "ror    $0x2,%11;"
683
0
        "add    %10,%12;"
684
0
        "add    12+%16,%12;"
685
0
        "pxor   %%xmm2,%%xmm7;"
686
0
        "mov    %4,%10;"
687
0
        "add    %12,%3;"
688
0
        "mov    %4,%12;"
689
0
        "pshufb %%xmm11,%%xmm7;"
690
0
        "or     %6,%10;"
691
0
        "add    %3,%k2;"
692
0
        "and    %6,%12;"
693
0
        "paddd  %%xmm0,%%xmm7;"
694
0
        "and    %5,%10;"
695
0
        "add    %11,%3;"
696
0
        "or     %12,%10;"
697
0
        "add    %10,%3;"
698
0
        "sub    $0x1,%1;"
699
0
        "jne    Lloop1_%=;"
700
0
        "mov    $0x2,%1;"
701
702
0
        "Lloop2_%=:"
703
0
        "paddd  0x0(%13),%%xmm4;"
704
0
        "movdqa %%xmm4,%16;"
705
0
        "mov    %k2,%10;"
706
0
        "ror    $0xe,%10;"
707
0
        "mov    %3,%11;"
708
0
        "xor    %k2,%10;"
709
0
        "ror    $0x9,%11;"
710
0
        "mov    %7,%12;"
711
0
        "xor    %3,%11;"
712
0
        "ror    $0x5,%10;"
713
0
        "xor    %8,%12;"
714
0
        "xor    %k2,%10;"
715
0
        "ror    $0xb,%11;"
716
0
        "and    %k2,%12;"
717
0
        "xor    %3,%11;"
718
0
        "ror    $0x6,%10;"
719
0
        "xor    %8,%12;"
720
0
        "add    %10,%12;"
721
0
        "ror    $0x2,%11;"
722
0
        "add    %16,%12;"
723
0
        "mov    %3,%10;"
724
0
        "add    %12,%9;"
725
0
        "mov    %3,%12;"
726
0
        "or     %5,%10;"
727
0
        "add    %9,%6;"
728
0
        "and    %5,%12;"
729
0
        "and    %4,%10;"
730
0
        "add    %11,%9;"
731
0
        "or     %12,%10;"
732
0
        "add    %10,%9;"
733
0
        "mov    %6,%10;"
734
0
        "ror    $0xe,%10;"
735
0
        "mov    %9,%11;"
736
0
        "xor    %6,%10;"
737
0
        "ror    $0x9,%11;"
738
0
        "mov    %k2,%12;"
739
0
        "xor    %9,%11;"
740
0
        "ror    $0x5,%10;"
741
0
        "xor    %7,%12;"
742
0
        "xor    %6,%10;"
743
0
        "ror    $0xb,%11;"
744
0
        "and    %6,%12;"
745
0
        "xor    %9,%11;"
746
0
        "ror    $0x6,%10;"
747
0
        "xor    %7,%12;"
748
0
        "add    %10,%12;"
749
0
        "ror    $0x2,%11;"
750
0
        "add    4+%16,%12;"
751
0
        "mov    %9,%10;"
752
0
        "add    %12,%8;"
753
0
        "mov    %9,%12;"
754
0
        "or     %4,%10;"
755
0
        "add    %8,%5;"
756
0
        "and    %4,%12;"
757
0
        "and    %3,%10;"
758
0
        "add    %11,%8;"
759
0
        "or     %12,%10;"
760
0
        "add    %10,%8;"
761
0
        "mov    %5,%10;"
762
0
        "ror    $0xe,%10;"
763
0
        "mov    %8,%11;"
764
0
        "xor    %5,%10;"
765
0
        "ror    $0x9,%11;"
766
0
        "mov    %6,%12;"
767
0
        "xor    %8,%11;"
768
0
        "ror    $0x5,%10;"
769
0
        "xor    %k2,%12;"
770
0
        "xor    %5,%10;"
771
0
        "ror    $0xb,%11;"
772
0
        "and    %5,%12;"
773
0
        "xor    %8,%11;"
774
0
        "ror    $0x6,%10;"
775
0
        "xor    %k2,%12;"
776
0
        "add    %10,%12;"
777
0
        "ror    $0x2,%11;"
778
0
        "add    8+%16,%12;"
779
0
        "mov    %8,%10;"
780
0
        "add    %12,%7;"
781
0
        "mov    %8,%12;"
782
0
        "or     %3,%10;"
783
0
        "add    %7,%4;"
784
0
        "and    %3,%12;"
785
0
        "and    %9,%10;"
786
0
        "add    %11,%7;"
787
0
        "or     %12,%10;"
788
0
        "add    %10,%7;"
789
0
        "mov    %4,%10;"
790
0
        "ror    $0xe,%10;"
791
0
        "mov    %7,%11;"
792
0
        "xor    %4,%10;"
793
0
        "ror    $0x9,%11;"
794
0
        "mov    %5,%12;"
795
0
        "xor    %7,%11;"
796
0
        "ror    $0x5,%10;"
797
0
        "xor    %6,%12;"
798
0
        "xor    %4,%10;"
799
0
        "ror    $0xb,%11;"
800
0
        "and    %4,%12;"
801
0
        "xor    %7,%11;"
802
0
        "ror    $0x6,%10;"
803
0
        "xor    %6,%12;"
804
0
        "add    %10,%12;"
805
0
        "ror    $0x2,%11;"
806
0
        "add    12+%16,%12;"
807
0
        "mov    %7,%10;"
808
0
        "add    %12,%k2;"
809
0
        "mov    %7,%12;"
810
0
        "or     %9,%10;"
811
0
        "add    %k2,%3;"
812
0
        "and    %9,%12;"
813
0
        "and    %8,%10;"
814
0
        "add    %11,%k2;"
815
0
        "or     %12,%10;"
816
0
        "add    %10,%k2;"
817
0
        "paddd  0x10(%13),%%xmm5;"
818
0
        "movdqa %%xmm5,%16;"
819
0
        "add    $0x20,%13;"
820
0
        "mov    %3,%10;"
821
0
        "ror    $0xe,%10;"
822
0
        "mov    %k2,%11;"
823
0
        "xor    %3,%10;"
824
0
        "ror    $0x9,%11;"
825
0
        "mov    %4,%12;"
826
0
        "xor    %k2,%11;"
827
0
        "ror    $0x5,%10;"
828
0
        "xor    %5,%12;"
829
0
        "xor    %3,%10;"
830
0
        "ror    $0xb,%11;"
831
0
        "and    %3,%12;"
832
0
        "xor    %k2,%11;"
833
0
        "ror    $0x6,%10;"
834
0
        "xor    %5,%12;"
835
0
        "add    %10,%12;"
836
0
        "ror    $0x2,%11;"
837
0
        "add    %16,%12;"
838
0
        "mov    %k2,%10;"
839
0
        "add    %12,%6;"
840
0
        "mov    %k2,%12;"
841
0
        "or     %8,%10;"
842
0
        "add    %6,%9;"
843
0
        "and    %8,%12;"
844
0
        "and    %7,%10;"
845
0
        "add    %11,%6;"
846
0
        "or     %12,%10;"
847
0
        "add    %10,%6;"
848
0
        "mov    %9,%10;"
849
0
        "ror    $0xe,%10;"
850
0
        "mov    %6,%11;"
851
0
        "xor    %9,%10;"
852
0
        "ror    $0x9,%11;"
853
0
        "mov    %3,%12;"
854
0
        "xor    %6,%11;"
855
0
        "ror    $0x5,%10;"
856
0
        "xor    %4,%12;"
857
0
        "xor    %9,%10;"
858
0
        "ror    $0xb,%11;"
859
0
        "and    %9,%12;"
860
0
        "xor    %6,%11;"
861
0
        "ror    $0x6,%10;"
862
0
        "xor    %4,%12;"
863
0
        "add    %10,%12;"
864
0
        "ror    $0x2,%11;"
865
0
        "add    4+%16,%12;"
866
0
        "mov    %6,%10;"
867
0
        "add    %12,%5;"
868
0
        "mov    %6,%12;"
869
0
        "or     %7,%10;"
870
0
        "add    %5,%8;"
871
0
        "and    %7,%12;"
872
0
        "and    %k2,%10;"
873
0
        "add    %11,%5;"
874
0
        "or     %12,%10;"
875
0
        "add    %10,%5;"
876
0
        "mov    %8,%10;"
877
0
        "ror    $0xe,%10;"
878
0
        "mov    %5,%11;"
879
0
        "xor    %8,%10;"
880
0
        "ror    $0x9,%11;"
881
0
        "mov    %9,%12;"
882
0
        "xor    %5,%11;"
883
0
        "ror    $0x5,%10;"
884
0
        "xor    %3,%12;"
885
0
        "xor    %8,%10;"
886
0
        "ror    $0xb,%11;"
887
0
        "and    %8,%12;"
888
0
        "xor    %5,%11;"
889
0
        "ror    $0x6,%10;"
890
0
        "xor    %3,%12;"
891
0
        "add    %10,%12;"
892
0
        "ror    $0x2,%11;"
893
0
        "add    8+%16,%12;"
894
0
        "mov    %5,%10;"
895
0
        "add    %12,%4;"
896
0
        "mov    %5,%12;"
897
0
        "or     %k2,%10;"
898
0
        "add    %4,%7;"
899
0
        "and    %k2,%12;"
900
0
        "and    %6,%10;"
901
0
        "add    %11,%4;"
902
0
        "or     %12,%10;"
903
0
        "add    %10,%4;"
904
0
        "mov    %7,%10;"
905
0
        "ror    $0xe,%10;"
906
0
        "mov    %4,%11;"
907
0
        "xor    %7,%10;"
908
0
        "ror    $0x9,%11;"
909
0
        "mov    %8,%12;"
910
0
        "xor    %4,%11;"
911
0
        "ror    $0x5,%10;"
912
0
        "xor    %9,%12;"
913
0
        "xor    %7,%10;"
914
0
        "ror    $0xb,%11;"
915
0
        "and    %7,%12;"
916
0
        "xor    %4,%11;"
917
0
        "ror    $0x6,%10;"
918
0
        "xor    %9,%12;"
919
0
        "add    %10,%12;"
920
0
        "ror    $0x2,%11;"
921
0
        "add    12+%16,%12;"
922
0
        "mov    %4,%10;"
923
0
        "add    %12,%3;"
924
0
        "mov    %4,%12;"
925
0
        "or     %6,%10;"
926
0
        "add    %3,%k2;"
927
0
        "and    %6,%12;"
928
0
        "and    %5,%10;"
929
0
        "add    %11,%3;"
930
0
        "or     %12,%10;"
931
0
        "add    %10,%3;"
932
0
        "movdqa %%xmm6,%%xmm4;"
933
0
        "movdqa %%xmm7,%%xmm5;"
934
0
        "sub    $0x1,%1;"
935
0
        "jne    Lloop2_%=;"
936
0
        "add    (%0),%3;"
937
0
        "mov    %3,(%0);"
938
0
        "add    0x4(%0),%4;"
939
0
        "mov    %4,0x4(%0);"
940
0
        "add    0x8(%0),%5;"
941
0
        "mov    %5,0x8(%0);"
942
0
        "add    0xc(%0),%6;"
943
0
        "mov    %6,0xc(%0);"
944
0
        "add    0x10(%0),%k2;"
945
0
        "mov    %k2,0x10(%0);"
946
0
        "add    0x14(%0),%7;"
947
0
        "mov    %7,0x14(%0);"
948
0
        "add    0x18(%0),%8;"
949
0
        "mov    %8,0x18(%0);"
950
0
        "add    0x1c(%0),%9;"
951
0
        "mov    %9,0x1c(%0);"
952
0
        "mov    %15,%1;"
953
0
        "add    $0x40,%1;"
954
0
        "cmp    %14,%1;"
955
0
        "jne    Lloop0_%=;"
956
957
0
        "Ldone_hash_%=:"
958
959
0
        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
960
0
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
961
0
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
962
0
   );
963
0
}
964
}
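// For reference (not part of the original file): a minimal, hypothetical caller sketch.
// It assumes an x86_64 CPU supporting the vector instructions used above; in Bitcoin Core
// this Transform is normally selected by the runtime dispatch in sha256.cpp rather than
// called directly, and message padding/length encoding is the caller's responsibility.

#include <cstddef>
#include <cstdint>

namespace sha256_sse4 { void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks); }

int main()
{
    // Standard SHA-256 initial state (FIPS 180-4).
    uint32_t s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                     0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
    // One already padded 64-byte block: the empty message is 0x80 followed by
    // zeros, with the 64-bit bit length (0) occupying the final 8 bytes.
    unsigned char block[64] = {0x80};
    sha256_sse4::Transform(s, block, 1);
    // s[] now holds the words of SHA-256(""); written out big-endian they read
    // e3b0c442 98fc1c14 9afbf4c8 996fb924 27ae41e4 649b934c a495991b 7852b855.
    return 0;
}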
965
966
/*
967
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
968
; Copyright (c) 2012, Intel Corporation 
969
; 
970
; All rights reserved. 
971
; 
972
; Redistribution and use in source and binary forms, with or without
973
; modification, are permitted provided that the following conditions are
974
; met: 
975
; 
976
; * Redistributions of source code must retain the above copyright
977
;   notice, this list of conditions and the following disclaimer.  
978
; 
979
; * Redistributions in binary form must reproduce the above copyright
980
;   notice, this list of conditions and the following disclaimer in the
981
;   documentation and/or other materials provided with the
982
;   distribution. 
983
; 
984
; * Neither the name of the Intel Corporation nor the names of its
985
;   contributors may be used to endorse or promote products derived from
986
;   this software without specific prior written permission. 
987
; 
988
; 
989
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
990
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
991
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
992
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
993
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
994
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
995
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
996
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
997
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
998
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
999
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1000
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1001
;
1002
; Example YASM command lines:
1003
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1004
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1005
;
1006
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1007
;
1008
; This code is described in an Intel White-Paper:
1009
; "Fast SHA-256 Implementations on Intel Architecture Processors"
1010
;
1011
; To find it, surf to https://www.intel.com/p/en_US/embedded
1012
; and search for that title.
1013
; The paper is expected to be released roughly at the end of April, 2012
1014
;
1015
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1016
; This code schedules 1 block at a time, with 4 lanes per block
1017
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1018
1019
%define MOVDQ movdqu ;; assume buffers not aligned 
1020
1021
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1022
1023
; addm [mem], reg
1024
; Add reg to mem using reg-mem add and store
1025
%macro addm 2
1026
    add %2, %1
1027
    mov %1, %2
1028
%endm
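; (Not in the original listing) addm [mem], reg is effectively *mem += reg, leaving the
; sum in both the memory operand and the register; it is used at the end of the hash to
; fold the working variables a..h back into the digest held at CTX.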
1029
1030
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1031
1032
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1033
; Load xmm with mem and byte swap each dword
1034
%macro COPY_XMM_AND_BSWAP 3
1035
    MOVDQ %1, %2
1036
    pshufb %1, %3
1037
%endmacro
1038
1039
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1040
1041
%define X0 xmm4
1042
%define X1 xmm5
1043
%define X2 xmm6
1044
%define X3 xmm7
1045
1046
%define XTMP0 xmm0
1047
%define XTMP1 xmm1
1048
%define XTMP2 xmm2
1049
%define XTMP3 xmm3
1050
%define XTMP4 xmm8
1051
%define XFER  xmm9
1052
1053
%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1054
%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1055
%define BYTE_FLIP_MASK  xmm12
1056
    
1057
%ifdef LINUX
1058
%define NUM_BLKS rdx  ; 3rd arg
1059
%define CTX rsi ; 2nd arg
1060
%define INP rdi ; 1st arg
1061
1062
%define SRND  rdi ; clobbers INP
1063
%define c ecx
1064
%define d   r8d
1065
%define e   edx
1066
%else
1067
%define NUM_BLKS r8 ; 3rd arg
1068
%define CTX rdx   ; 2nd arg
1069
%define INP rcx   ; 1st arg
1070
1071
%define SRND  rcx ; clobbers INP
1072
%define c   edi 
1073
%define d esi 
1074
%define e   r8d
1075
    
1076
%endif
1077
%define TBL rbp
1078
%define a eax
1079
%define b ebx
1080
1081
%define f r9d
1082
%define g r10d
1083
%define h r11d
1084
1085
%define y0 r13d
1086
%define y1 r14d
1087
%define y2 r15d
1088
1089
1090
1091
_INP_END_SIZE equ 8
1092
_INP_SIZE equ 8
1093
_XFER_SIZE  equ 8
1094
%ifdef LINUX
1095
_XMM_SAVE_SIZE  equ 0
1096
%else
1097
_XMM_SAVE_SIZE  equ 7*16
1098
%endif
1099
; STACK_SIZE plus pushes must be an odd multiple of 8
1100
_ALIGN_SIZE equ 8
1101
1102
_INP_END  equ 0
1103
_INP    equ _INP_END  + _INP_END_SIZE
1104
_XFER   equ _INP      + _INP_SIZE
1105
_XMM_SAVE equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
1106
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
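; (Not in the original listing) a worked check of the alignment rule above:
;   LINUX:   _INP_END = 0, _INP = 8, _XFER = 16, _XMM_SAVE = 32, _XMM_SAVE_SIZE = 0
;            STACK_SIZE = 32;  pushes = rbx, rbp, r13, r14, r15 = 5*8 = 40;  32 + 40 = 72 = 9*8
;   Windows: _XMM_SAVE_SIZE = 7*16 = 112, so STACK_SIZE = 144;
;            pushes = 7*8 = 56;  144 + 56 = 200 = 25*8
; Both totals are odd multiples of 8, so together with the 8-byte return address rsp ends
; up 16-byte aligned and the movdqa stores to [rsp + _XFER] and [rsp + _XMM_SAVE] are aligned.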
1107
1108
; rotate_Xs
1109
; Rotate values of symbols X0...X3
1110
%macro rotate_Xs 0
1111
%xdefine X_ X0
1112
%xdefine X0 X1
1113
%xdefine X1 X2
1114
%xdefine X2 X3
1115
%xdefine X3 X_
1116
%endm
1117
1118
; ROTATE_ARGS
1119
; Rotate values of symbols a...h
1120
%macro ROTATE_ARGS 0
1121
%xdefine TMP_ h
1122
%xdefine h g
1123
%xdefine g f
1124
%xdefine f e
1125
%xdefine e d
1126
%xdefine d c
1127
%xdefine c b
1128
%xdefine b a
1129
%xdefine a TMP_
1130
%endm
1131
1132
%macro FOUR_ROUNDS_AND_SCHED 0
1133
  ;; compute s0 four at a time and s1 two at a time
1134
  ;; compute W[-16] + W[-7] 4 at a time
1135
  movdqa  XTMP0, X3
1136
    mov y0, e   ; y0 = e
1137
    ror y0, (25-11) ; y0 = e >> (25-11)
1138
    mov y1, a   ; y1 = a
1139
  palignr XTMP0, X2, 4  ; XTMP0 = W[-7]
1140
    ror y1, (22-13) ; y1 = a >> (22-13)
1141
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1142
    mov y2, f   ; y2 = f
1143
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1144
  movdqa  XTMP1, X1
1145
    xor y1, a   ; y1 = a ^ (a >> (22-13))
1146
    xor y2, g   ; y2 = f^g
1147
  paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1148
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1149
    and y2, e   ; y2 = (f^g)&e
1150
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1151
  ;; compute s0
1152
  palignr XTMP1, X0, 4  ; XTMP1 = W[-15]
1153
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1154
    ror y0, 6   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1155
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1156
  movdqa  XTMP2, XTMP1  ; XTMP2 = W[-15]
1157
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1158
    add y2, y0    ; y2 = S1 + CH
1159
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1160
  movdqa  XTMP3, XTMP1  ; XTMP3 = W[-15]
1161
    mov y0, a   ; y0 = a
1162
    add h, y2   ; h = h + S1 + CH + k + w
1163
    mov y2, a   ; y2 = a
1164
  pslld XTMP1, (32-7)
1165
    or  y0, c   ; y0 = a|c
1166
    add d, h    ; d = d + h + S1 + CH + k + w
1167
    and y2, c   ; y2 = a&c
1168
  psrld XTMP2, 7
1169
    and y0, b   ; y0 = (a|c)&b
1170
    add h, y1   ; h = h + S1 + CH + k + w + S0
1171
  por XTMP1, XTMP2  ; XTMP1 = W[-15] ror 7
1172
    or  y0, y2    ; y0 = MAJ = ((a|c)&b)|(a&c)
1173
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1174
1175
ROTATE_ARGS
1176
  movdqa  XTMP2, XTMP3  ; XTMP2 = W[-15]
1177
    mov y0, e   ; y0 = e
1178
    mov y1, a   ; y1 = a
1179
  movdqa  XTMP4, XTMP3  ; XTMP4 = W[-15]
1180
    ror y0, (25-11) ; y0 = e >> (25-11)
1181
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1182
    mov y2, f   ; y2 = f
1183
    ror y1, (22-13) ; y1 = a >> (22-13)
1184
  pslld XTMP3, (32-18)
1185
    xor y1, a   ; y1 = a ^ (a >> (22-13))
1186
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1187
    xor y2, g   ; y2 = f^g
1188
  psrld XTMP2, 18
1189
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1190
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1191
    and y2, e   ; y2 = (f^g)&e
1192
    ror y0, 6   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1193
  pxor  XTMP1, XTMP3
1194
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1195
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1196
  psrld XTMP4, 3  ; XTMP4 = W[-15] >> 3
1197
    add y2, y0    ; y2 = S1 + CH
1198
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1199
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1200
  pxor  XTMP1, XTMP2  ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1201
    mov y0, a   ; y0 = a
1202
    add h, y2   ; h = h + S1 + CH + k + w
1203
    mov y2, a   ; y2 = a
1204
  pxor  XTMP1, XTMP4  ; XTMP1 = s0
1205
    or  y0, c   ; y0 = a|c
1206
    add d, h    ; d = d + h + S1 + CH + k + w
1207
    and y2, c   ; y2 = a&c
1208
  ;; compute low s1
1209
  pshufd  XTMP2, X3, 11111010b  ; XTMP2 = W[-2] {BBAA}
1210
    and y0, b   ; y0 = (a|c)&b
1211
    add h, y1   ; h = h + S1 + CH + k + w + S0
1212
  paddd XTMP0, XTMP1  ; XTMP0 = W[-16] + W[-7] + s0
1213
    or  y0, y2    ; y0 = MAJ = ((a|c)&b)|(a&c)
1214
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1215
1216
ROTATE_ARGS
1217
  movdqa  XTMP3, XTMP2  ; XTMP3 = W[-2] {BBAA}
1218
    mov y0, e   ; y0 = e
1219
    mov y1, a   ; y1 = a
1220
    ror y0, (25-11) ; y0 = e >> (25-11)
1221
  movdqa  XTMP4, XTMP2  ; XTMP4 = W[-2] {BBAA}
1222
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1223
    ror y1, (22-13) ; y1 = a >> (22-13)
1224
    mov y2, f   ; y2 = f
1225
    xor y1, a   ; y1 = a ^ (a >> (22-13))
1226
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1227
  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1228
    xor y2, g   ; y2 = f^g
1229
  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1230
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1231
    and y2, e   ; y2 = (f^g)&e
1232
  psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1233
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1234
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1235
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1236
    ror y0, 6   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1237
  pxor  XTMP2, XTMP3
1238
    add y2, y0    ; y2 = S1 + CH
1239
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1240
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1241
  pxor  XTMP4, XTMP2  ; XTMP4 = s1 {xBxA}
1242
    mov y0, a   ; y0 = a
1243
    add h, y2   ; h = h + S1 + CH + k + w
1244
    mov y2, a   ; y2 = a
1245
  pshufb  XTMP4, SHUF_00BA  ; XTMP4 = s1 {00BA}
1246
    or  y0, c   ; y0 = a|c
1247
    add d, h    ; d = d + h + S1 + CH + k + w
1248
    and y2, c   ; y2 = a&c
1249
  paddd XTMP0, XTMP4  ; XTMP0 = {..., ..., W[1], W[0]}
1250
    and y0, b   ; y0 = (a|c)&b
1251
    add h, y1   ; h = h + S1 + CH + k + w + S0
1252
  ;; compute high s1
1253
  pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1254
    or  y0, y2    ; y0 = MAJ = ((a|c)&b)|(a&c)
1255
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1256
1257
ROTATE_ARGS
1258
  movdqa  XTMP3, XTMP2  ; XTMP3 = W[-2] {DDCC}
1259
    mov y0, e   ; y0 = e
1260
    ror y0, (25-11) ; y0 = e >> (25-11)
1261
    mov y1, a   ; y1 = a
1262
  movdqa  X0,    XTMP2  ; X0    = W[-2] {DDCC}
1263
    ror y1, (22-13) ; y1 = a >> (22-13)
1264
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1265
    mov y2, f   ; y2 = f
1266
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1267
  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1268
    xor y1, a   ; y1 = a ^ (a >> (22-13))
1269
    xor y2, g   ; y2 = f^g
1270
  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1271
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1272
    and y2, e   ; y2 = (f^g)&e
1273
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1274
  psrld X0,    10 ; X0 = W[-2] >> 10 {DDCC}
1275
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1276
    ror y0, 6   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1277
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1278
  pxor  XTMP2, XTMP3
1279
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1280
    add y2, y0    ; y2 = S1 + CH
1281
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1282
  pxor  X0, XTMP2 ; X0 = s1 {xDxC}
1283
    mov y0, a   ; y0 = a
1284
    add h, y2   ; h = h + S1 + CH + k + w
1285
    mov y2, a   ; y2 = a
1286
  pshufb  X0, SHUF_DC00 ; X0 = s1 {DC00}
1287
    or  y0, c   ; y0 = a|c
1288
    add d, h    ; d = d + h + S1 + CH + k + w
1289
    and y2, c   ; y2 = a&c
1290
  paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1291
    and y0, b   ; y0 = (a|c)&b
1292
    add h, y1   ; h = h + S1 + CH + k + w + S0
1293
    or  y0, y2    ; y0 = MAJ = ((a|c)&b)|(a&c)
1294
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1295
1296
ROTATE_ARGS
1297
rotate_Xs
1298
%endm
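; (Not in the original listing) the message-schedule recurrence this macro vectorizes,
; in FIPS 180-4 notation:
;   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
;   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
; The pslld/psrld pairs above build the rotates (e.g. ror 7 = (x << 25) | (x >> 7)), and
; s1 is computed two lanes at a time via the psrlq trick and the xBxA / xDxC shuffles.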
1299
1300
;; input is [rsp + _XFER + %1 * 4]
1301
%macro DO_ROUND 1
1302
    mov y0, e   ; y0 = e
1303
    ror y0, (25-11) ; y0 = e >> (25-11)
1304
    mov y1, a   ; y1 = a
1305
    xor y0, e   ; y0 = e ^ (e >> (25-11))
1306
    ror y1, (22-13) ; y1 = a >> (22-13)
1307
    mov y2, f   ; y2 = f
1308
    xor y1, a   ; y1 = a ^ (a >> (22-13))
1309
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1310
    xor y2, g   ; y2 = f^g
1311
    xor y0, e   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1312
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1313
    and y2, e   ; y2 = (f^g)&e
1314
    xor y1, a   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1315
    ror y0, 6   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1316
    xor y2, g   ; y2 = CH = ((f^g)&e)^g
1317
    add y2, y0    ; y2 = S1 + CH
1318
    ror y1, 2   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1319
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
1320
    mov y0, a   ; y0 = a
1321
    add h, y2   ; h = h + S1 + CH + k + w
1322
    mov y2, a   ; y2 = a
1323
    or  y0, c   ; y0 = a|c
1324
    add d, h    ; d = d + h + S1 + CH + k + w
1325
    and y2, c   ; y2 = a&c
1326
    and y0, b   ; y0 = (a|c)&b
1327
    add h, y1   ; h = h + S1 + CH + k + w + S0
1328
    or  y0, y2    ; y0 = MAJ = ((a|c)&b)|(a&c)
1329
    add h, y0   ; h = h + S1 + CH + k + w + S0 + MAJ
1330
    ROTATE_ARGS
1331
%endm
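; (Not in the original listing) the scalar round computed above, in the same notation:
;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;   CH  = (e & f) ^ (~e & g)            ; computed here as ((f ^ g) & e) ^ g
;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;   MAJ = (a & b) ^ (a & c) ^ (b & c)   ; computed here as ((a | c) & b) | (a & c)
;   h += S1 + CH + K[t] + W[t];  d += h;  h += S0 + MAJ
; after which ROTATE_ARGS renames a..h for the next round.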
1332
1333
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1334
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1335
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1336
;; arg 1 : pointer to input data
1337
;; arg 2 : pointer to digest
1338
;; arg 3 : Num blocks
1339
section .text
1340
global sha256_sse4
1341
align 32
1342
sha256_sse4:
1343
    push  rbx
1344
%ifndef LINUX
1345
    push  rsi
1346
    push  rdi
1347
%endif
1348
    push  rbp
1349
    push  r13
1350
    push  r14
1351
    push  r15
1352
1353
    sub rsp,STACK_SIZE
1354
%ifndef LINUX
1355
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6 
1356
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
1357
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8 
1358
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9 
1359
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
1360
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
1361
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
1362
%endif
1363
1364
    shl NUM_BLKS, 6 ; convert to bytes
1365
    jz  done_hash
1366
    add NUM_BLKS, INP ; pointer to end of data
1367
    mov [rsp + _INP_END], NUM_BLKS
1368
1369
    ;; load initial digest
1370
    mov a,[4*0 + CTX]
1371
    mov b,[4*1 + CTX]
1372
    mov c,[4*2 + CTX]
1373
    mov d,[4*3 + CTX]
1374
    mov e,[4*4 + CTX]
1375
    mov f,[4*5 + CTX]
1376
    mov g,[4*6 + CTX]
1377
    mov h,[4*7 + CTX]
1378
1379
    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1380
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
1381
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]
1382
1383
loop0:
1384
    lea TBL,[K256 wrt rip]
1385
1386
    ;; byte swap first 16 dwords
1387
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
1388
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
1389
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
1390
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK
1391
    
1392
    mov [rsp + _INP], INP
1393
1394
    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1395
    mov SRND, 3
1396
align 16
1397
loop1:
1398
    movdqa  XFER, [TBL + 0*16]
1399
    paddd XFER, X0
1400
    movdqa  [rsp + _XFER], XFER
1401
    FOUR_ROUNDS_AND_SCHED
1402
1403
    movdqa  XFER, [TBL + 1*16]
1404
    paddd XFER, X0
1405
    movdqa  [rsp + _XFER], XFER
1406
    FOUR_ROUNDS_AND_SCHED
1407
1408
    movdqa  XFER, [TBL + 2*16]
1409
    paddd XFER, X0
1410
    movdqa  [rsp + _XFER], XFER
1411
    FOUR_ROUNDS_AND_SCHED
1412
1413
    movdqa  XFER, [TBL + 3*16]
1414
    paddd XFER, X0
1415
    movdqa  [rsp + _XFER], XFER
1416
    add TBL, 4*16
1417
    FOUR_ROUNDS_AND_SCHED
1418
1419
    sub SRND, 1
1420
    jne loop1
1421
1422
    mov SRND, 2
1423
loop2:
1424
    paddd X0, [TBL + 0*16]
1425
    movdqa  [rsp + _XFER], X0
1426
    DO_ROUND  0
1427
    DO_ROUND  1
1428
    DO_ROUND  2
1429
    DO_ROUND  3
1430
    paddd X1, [TBL + 1*16]
1431
    movdqa  [rsp + _XFER], X1
1432
    add TBL, 2*16
1433
    DO_ROUND  0
1434
    DO_ROUND  1
1435
    DO_ROUND  2
1436
    DO_ROUND  3
1437
1438
    movdqa  X0, X2
1439
    movdqa  X1, X3
1440
1441
    sub SRND, 1
1442
    jne loop2
1443
1444
    addm  [4*0 + CTX],a
1445
    addm  [4*1 + CTX],b
1446
    addm  [4*2 + CTX],c
1447
    addm  [4*3 + CTX],d
1448
    addm  [4*4 + CTX],e
1449
    addm  [4*5 + CTX],f
1450
    addm  [4*6 + CTX],g
1451
    addm  [4*7 + CTX],h
1452
1453
    mov INP, [rsp + _INP]
1454
    add INP, 64
1455
    cmp INP, [rsp + _INP_END]
1456
    jne loop0
1457
1458
done_hash:
1459
%ifndef LINUX
1460
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
1461
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
1462
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
1463
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
1464
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
1465
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
1466
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
1467
%endif
1468
1469
    add rsp, STACK_SIZE
1470
1471
    pop r15
1472
    pop r14
1473
    pop r13
1474
    pop rbp
1475
%ifndef LINUX
1476
    pop rdi
1477
    pop rsi
1478
%endif
1479
    pop rbx
1480
1481
    ret 
1482
    
1483
1484
section .data
1485
align 64
1486
K256:
1487
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1488
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1489
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1490
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1491
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1492
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1493
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1494
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1495
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1496
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1497
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1498
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1499
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1500
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1501
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1502
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1503
1504
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1505
1506
; shuffle xBxA -> 00BA
1507
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1508
1509
; shuffle xDxC -> DC00
1510
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1511
*/
1512
1513
#endif