Coverage Report

Created: 2026-06-12 16:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/bitcoin/src/crypto/sha256_sse4.cpp
Line
Count
Source
1
// Copyright (c) 2017-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// This is a translation to GCC extended asm syntax from YASM code by Intel
6
// (available at the bottom of this file).
7
8
#if defined(__x86_64__) || defined(__amd64__)
9
10
#include <cstdint>
11
#include <cstdlib>
12
13
namespace sha256_sse4
14
{
15
/*
16
Both Clang and GCC fail with ASan on this inline assembly:
17
- Clang: compile failure with -O0 or -O2 + -fcf-protection under ASan.
18
  See https://github.com/llvm/llvm-project/issues/92182
19
  and https://github.com/bitcoin/bitcoin/issues/31913.
20
- GCC: runtime SEGV during SHA256AutoDetect()'s self-test under ASan,
21
  regardless of optimization level.
22
  See https://github.com/bitcoin/bitcoin/issues/34881.
23
*/
24
/*
 * Transform: run the SHA-256 compression function over `blocks` consecutive
 * 64-byte blocks, using a single inline-assembly statement.
 *
 *   s      - pointer to the eight 32-bit state words; they are loaded at the
 *            start and, after every block, the working variables are added
 *            back into s[0..7] in place.
 *   chunk  - input bytes; 64 bytes are consumed per block and are read with
 *            unaligned loads (movdqu), so no alignment is required.
 *   blocks - number of 64-byte blocks to process. When it is 0 the asm jumps
 *            straight to its exit label and *s is left untouched.
 *
 * Each block is processed as 64 rounds: 3 passes of 16 rounds that also
 * compute the next message-schedule words in xmm registers, followed by
 * 2 passes of 8 rounds that only consume the already scheduled words.
 */
#if defined(__SANITIZE_ADDRESS__)
25
  __attribute__((no_sanitize("address")))
26
#elif defined(__clang__)
27
#if __has_feature(address_sanitizer) // fallback can be removed once support for Clang 21 is dropped
28
  __attribute__((no_sanitize("address")))
29
#endif
30
#endif
31
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
32
0
{
33
0
    // Round-constant table (64 words). 16-byte aligned because it is read
    // with movdqa / paddd memory operands through the `tbl` pointer.
    static const uint32_t K256 alignas(16) [] = {
34
0
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
35
0
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
36
0
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
37
0
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
38
0
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
39
0
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
40
0
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
41
0
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
42
0
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
43
0
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
44
0
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
45
0
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
46
0
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
47
0
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
48
0
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
49
0
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
50
0
    };
51
0
    // pshufb mask (kept in xmm12): reverses the byte order inside each 32-bit
    // lane, applied to every 16 input bytes right after they are loaded.
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
52
0
    // pshufb mask (kept in xmm10): moves dwords 0 and 2 into the two low
    // lanes and zeroes the two high lanes (0xff selector bytes produce zero).
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
53
0
    // pshufb mask (kept in xmm11): zeroes the two low lanes and moves dwords
    // 0 and 2 into the two high lanes.
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
54
0
    // Register operands for the working variables and three scratch values.
    // There is deliberately no `e`: the asm keeps it in %k2, the 32-bit view
    // of the register that initially holds `blocks`.
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
55
0
    // Cursor into K256 (register operand %13).
    uint64_t tbl;
56
0
    // Memory slots: end-of-input address (%14) and the saved input pointer
    // (%15) while the register of `chunk` is reused as a loop counter.
    uint64_t inp_end, inp;
57
0
    // 16-byte scratch (%16) holding "round constants + schedule words" for
    // four rounds at a time; the scalar code reads it at offsets 0/4/8/12.
    uint32_t xfer alignas(16) [4];
58
59
0
    __asm__ __volatile__(
60
0
        // Operand map: %0=s %1=chunk %2=blocks(/e) %3=a %4=b %5=c %6=d %7=f
        // %8=g %9=h %10=y0 %11=y1 %12=y2 %13=tbl %14=inp_end %15=inp %16=xfer
        // %17=K256 %18=FLIP_MASK %19=SHUF_00BA %20=SHUF_DC00.
        //
        // blocks *= 64 (byte count); leave immediately if that is zero.
        // Otherwise turn it into the end address and park it in inp_end so
        // the register can be reused for `e`.
        "shl    $0x6,%2;"
61
0
        "je     Ldone_hash_%=;"
62
0
        "add    %1,%2;"
63
0
        "mov    %2,%14;"
64
0
        // Load the eight state words s[0..7] into a, b, c, d, e(%k2), f, g, h.
        "mov    (%0),%3;"
65
0
        "mov    0x4(%0),%4;"
66
0
        "mov    0x8(%0),%5;"
67
0
        "mov    0xc(%0),%6;"
68
0
        "mov    0x10(%0),%k2;"
69
0
        "mov    0x14(%0),%7;"
70
0
        "mov    0x18(%0),%8;"
71
0
        "mov    0x1c(%0),%9;"
72
0
        // Keep the three shuffle masks resident in xmm12 / xmm10 / xmm11.
        "movdqa %18,%%xmm12;"
73
0
        "movdqa %19,%%xmm10;"
74
0
        "movdqa %20,%%xmm11;"
75
76
0
        // ---- Outer loop: one iteration per 64-byte input block ----
        // Reset tbl to the start of K256 and load the block into xmm4..xmm7
        // (16 bytes each), byte-swapping every 32-bit word via FLIP_MASK.
        "Lloop0_%=:"
77
0
        "lea    %17,%13;"
78
0
        "movdqu (%1),%%xmm4;"
79
0
        "pshufb %%xmm12,%%xmm4;"
80
0
        "movdqu 0x10(%1),%%xmm5;"
81
0
        "pshufb %%xmm12,%%xmm5;"
82
0
        "movdqu 0x20(%1),%%xmm6;"
83
0
        "pshufb %%xmm12,%%xmm6;"
84
0
        "movdqu 0x30(%1),%%xmm7;"
85
0
        "pshufb %%xmm12,%%xmm7;"
86
0
        // Save the input pointer to memory; %1 now becomes the pass counter.
        "mov    %1,%15;"
87
0
        "mov    $3,%1;"
88
89
0
        // ---- Loop 1: 3 passes x 16 rounds, with message scheduling ----
        // Each pass has four groups of four rounds. A group stores
        // K[tbl + 16*i] + xmm(4+i) into xfer, runs four scalar rounds that
        // read xfer at +0/+4/+8/+12, and interleaves SSE instructions that
        // build the next four schedule words into the same xmm(4+i).
        // Group 0: consumes xmm4, produces the new xmm4.
        "Lloop1_%=:"
90
0
        "movdqa 0x0(%13),%%xmm9;"
91
0
        "paddd  %%xmm4,%%xmm9;"
92
0
        "movdqa %%xmm9,%16;"
93
0
        "movdqa %%xmm7,%%xmm0;"
94
0
        "mov    %k2,%10;"
95
0
        "ror    $0xe,%10;"
96
0
        "mov    %3,%11;"
97
0
        "palignr $0x4,%%xmm6,%%xmm0;"
98
0
        "ror    $0x9,%11;"
99
0
        "xor    %k2,%10;"
100
0
        "mov    %7,%12;"
101
0
        "ror    $0x5,%10;"
102
0
        "movdqa %%xmm5,%%xmm1;"
103
0
        "xor    %3,%11;"
104
0
        "xor    %8,%12;"
105
0
        "paddd  %%xmm4,%%xmm0;"
106
0
        "xor    %k2,%10;"
107
0
        "and    %k2,%12;"
108
0
        "ror    $0xb,%11;"
109
0
        "palignr $0x4,%%xmm4,%%xmm1;"
110
0
        "xor    %3,%11;"
111
0
        "ror    $0x6,%10;"
112
0
        "xor    %8,%12;"
113
0
        "movdqa %%xmm1,%%xmm2;"
114
0
        "ror    $0x2,%11;"
115
0
        "add    %10,%12;"
116
0
        "add    %16,%12;"
117
0
        "movdqa %%xmm1,%%xmm3;"
118
0
        "mov    %3,%10;"
119
0
        "add    %12,%9;"
120
0
        "mov    %3,%12;"
121
0
        "pslld  $0x19,%%xmm1;"
122
0
        "or     %5,%10;"
123
0
        "add    %9,%6;"
124
0
        "and    %5,%12;"
125
0
        "psrld  $0x7,%%xmm2;"
126
0
        "and    %4,%10;"
127
0
        "add    %11,%9;"
128
0
        "por    %%xmm2,%%xmm1;"
129
0
        "or     %12,%10;"
130
0
        "add    %10,%9;"
131
0
        "movdqa %%xmm3,%%xmm2;"
132
0
        "mov    %6,%10;"
133
0
        "mov    %9,%11;"
134
0
        "movdqa %%xmm3,%%xmm8;"
135
0
        "ror    $0xe,%10;"
136
0
        "xor    %6,%10;"
137
0
        "mov    %k2,%12;"
138
0
        "ror    $0x9,%11;"
139
0
        "pslld  $0xe,%%xmm3;"
140
0
        "xor    %9,%11;"
141
0
        "ror    $0x5,%10;"
142
0
        "xor    %7,%12;"
143
0
        "psrld  $0x12,%%xmm2;"
144
0
        "ror    $0xb,%11;"
145
0
        "xor    %6,%10;"
146
0
        "and    %6,%12;"
147
0
        "ror    $0x6,%10;"
148
0
        "pxor   %%xmm3,%%xmm1;"
149
0
        "xor    %9,%11;"
150
0
        "xor    %7,%12;"
151
0
        "psrld  $0x3,%%xmm8;"
152
0
        "add    %10,%12;"
153
0
        "add    4+%16,%12;"
154
0
        "ror    $0x2,%11;"
155
0
        "pxor   %%xmm2,%%xmm1;"
156
0
        "mov    %9,%10;"
157
0
        "add    %12,%8;"
158
0
        "mov    %9,%12;"
159
0
        "pxor   %%xmm8,%%xmm1;"
160
0
        "or     %4,%10;"
161
0
        "add    %8,%5;"
162
0
        "and    %4,%12;"
163
0
        "pshufd $0xfa,%%xmm7,%%xmm2;"
164
0
        "and    %3,%10;"
165
0
        "add    %11,%8;"
166
0
        "paddd  %%xmm1,%%xmm0;"
167
0
        "or     %12,%10;"
168
0
        "add    %10,%8;"
169
0
        "movdqa %%xmm2,%%xmm3;"
170
0
        "mov    %5,%10;"
171
0
        "mov    %8,%11;"
172
0
        "ror    $0xe,%10;"
173
0
        "movdqa %%xmm2,%%xmm8;"
174
0
        "xor    %5,%10;"
175
0
        "ror    $0x9,%11;"
176
0
        "mov    %6,%12;"
177
0
        "xor    %8,%11;"
178
0
        "ror    $0x5,%10;"
179
0
        "psrlq  $0x11,%%xmm2;"
180
0
        "xor    %k2,%12;"
181
0
        "psrlq  $0x13,%%xmm3;"
182
0
        "xor    %5,%10;"
183
0
        "and    %5,%12;"
184
0
        "psrld  $0xa,%%xmm8;"
185
0
        "ror    $0xb,%11;"
186
0
        "xor    %8,%11;"
187
0
        "xor    %k2,%12;"
188
0
        "ror    $0x6,%10;"
189
0
        "pxor   %%xmm3,%%xmm2;"
190
0
        "add    %10,%12;"
191
0
        "ror    $0x2,%11;"
192
0
        "add    8+%16,%12;"
193
0
        "pxor   %%xmm2,%%xmm8;"
194
0
        "mov    %8,%10;"
195
0
        "add    %12,%7;"
196
0
        "mov    %8,%12;"
197
0
        "pshufb %%xmm10,%%xmm8;"
198
0
        "or     %3,%10;"
199
0
        "add    %7,%4;"
200
0
        "and    %3,%12;"
201
0
        "paddd  %%xmm8,%%xmm0;"
202
0
        "and    %9,%10;"
203
0
        "add    %11,%7;"
204
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
205
0
        "or     %12,%10;"
206
0
        "add    %10,%7;"
207
0
        "movdqa %%xmm2,%%xmm3;"
208
0
        "mov    %4,%10;"
209
0
        "ror    $0xe,%10;"
210
0
        "mov    %7,%11;"
211
0
        "movdqa %%xmm2,%%xmm4;"
212
0
        "ror    $0x9,%11;"
213
0
        "xor    %4,%10;"
214
0
        "mov    %5,%12;"
215
0
        "ror    $0x5,%10;"
216
0
        "psrlq  $0x11,%%xmm2;"
217
0
        "xor    %7,%11;"
218
0
        "xor    %6,%12;"
219
0
        "psrlq  $0x13,%%xmm3;"
220
0
        "xor    %4,%10;"
221
0
        "and    %4,%12;"
222
0
        "ror    $0xb,%11;"
223
0
        "psrld  $0xa,%%xmm4;"
224
0
        "xor    %7,%11;"
225
0
        "ror    $0x6,%10;"
226
0
        "xor    %6,%12;"
227
0
        "pxor   %%xmm3,%%xmm2;"
228
0
        "ror    $0x2,%11;"
229
0
        "add    %10,%12;"
230
0
        "add    12+%16,%12;"
231
0
        "pxor   %%xmm2,%%xmm4;"
232
0
        "mov    %7,%10;"
233
0
        "add    %12,%k2;"
234
0
        "mov    %7,%12;"
235
0
        "pshufb %%xmm11,%%xmm4;"
236
0
        "or     %9,%10;"
237
0
        "add    %k2,%3;"
238
0
        "and    %9,%12;"
239
0
        "paddd  %%xmm0,%%xmm4;"
240
0
        "and    %8,%10;"
241
0
        "add    %11,%k2;"
242
0
        "or     %12,%10;"
243
0
        "add    %10,%k2;"
244
0
        // Group 1: xfer = K[tbl+0x10] + xmm5; produces the new xmm5.
        "movdqa 0x10(%13),%%xmm9;"
245
0
        "paddd  %%xmm5,%%xmm9;"
246
0
        "movdqa %%xmm9,%16;"
247
0
        "movdqa %%xmm4,%%xmm0;"
248
0
        "mov    %3,%10;"
249
0
        "ror    $0xe,%10;"
250
0
        "mov    %k2,%11;"
251
0
        "palignr $0x4,%%xmm7,%%xmm0;"
252
0
        "ror    $0x9,%11;"
253
0
        "xor    %3,%10;"
254
0
        "mov    %4,%12;"
255
0
        "ror    $0x5,%10;"
256
0
        "movdqa %%xmm6,%%xmm1;"
257
0
        "xor    %k2,%11;"
258
0
        "xor    %5,%12;"
259
0
        "paddd  %%xmm5,%%xmm0;"
260
0
        "xor    %3,%10;"
261
0
        "and    %3,%12;"
262
0
        "ror    $0xb,%11;"
263
0
        "palignr $0x4,%%xmm5,%%xmm1;"
264
0
        "xor    %k2,%11;"
265
0
        "ror    $0x6,%10;"
266
0
        "xor    %5,%12;"
267
0
        "movdqa %%xmm1,%%xmm2;"
268
0
        "ror    $0x2,%11;"
269
0
        "add    %10,%12;"
270
0
        "add    %16,%12;"
271
0
        "movdqa %%xmm1,%%xmm3;"
272
0
        "mov    %k2,%10;"
273
0
        "add    %12,%6;"
274
0
        "mov    %k2,%12;"
275
0
        "pslld  $0x19,%%xmm1;"
276
0
        "or     %8,%10;"
277
0
        "add    %6,%9;"
278
0
        "and    %8,%12;"
279
0
        "psrld  $0x7,%%xmm2;"
280
0
        "and    %7,%10;"
281
0
        "add    %11,%6;"
282
0
        "por    %%xmm2,%%xmm1;"
283
0
        "or     %12,%10;"
284
0
        "add    %10,%6;"
285
0
        "movdqa %%xmm3,%%xmm2;"
286
0
        "mov    %9,%10;"
287
0
        "mov    %6,%11;"
288
0
        "movdqa %%xmm3,%%xmm8;"
289
0
        "ror    $0xe,%10;"
290
0
        "xor    %9,%10;"
291
0
        "mov    %3,%12;"
292
0
        "ror    $0x9,%11;"
293
0
        "pslld  $0xe,%%xmm3;"
294
0
        "xor    %6,%11;"
295
0
        "ror    $0x5,%10;"
296
0
        "xor    %4,%12;"
297
0
        "psrld  $0x12,%%xmm2;"
298
0
        "ror    $0xb,%11;"
299
0
        "xor    %9,%10;"
300
0
        "and    %9,%12;"
301
0
        "ror    $0x6,%10;"
302
0
        "pxor   %%xmm3,%%xmm1;"
303
0
        "xor    %6,%11;"
304
0
        "xor    %4,%12;"
305
0
        "psrld  $0x3,%%xmm8;"
306
0
        "add    %10,%12;"
307
0
        "add    4+%16,%12;"
308
0
        "ror    $0x2,%11;"
309
0
        "pxor   %%xmm2,%%xmm1;"
310
0
        "mov    %6,%10;"
311
0
        "add    %12,%5;"
312
0
        "mov    %6,%12;"
313
0
        "pxor   %%xmm8,%%xmm1;"
314
0
        "or     %7,%10;"
315
0
        "add    %5,%8;"
316
0
        "and    %7,%12;"
317
0
        "pshufd $0xfa,%%xmm4,%%xmm2;"
318
0
        "and    %k2,%10;"
319
0
        "add    %11,%5;"
320
0
        "paddd  %%xmm1,%%xmm0;"
321
0
        "or     %12,%10;"
322
0
        "add    %10,%5;"
323
0
        "movdqa %%xmm2,%%xmm3;"
324
0
        "mov    %8,%10;"
325
0
        "mov    %5,%11;"
326
0
        "ror    $0xe,%10;"
327
0
        "movdqa %%xmm2,%%xmm8;"
328
0
        "xor    %8,%10;"
329
0
        "ror    $0x9,%11;"
330
0
        "mov    %9,%12;"
331
0
        "xor    %5,%11;"
332
0
        "ror    $0x5,%10;"
333
0
        "psrlq  $0x11,%%xmm2;"
334
0
        "xor    %3,%12;"
335
0
        "psrlq  $0x13,%%xmm3;"
336
0
        "xor    %8,%10;"
337
0
        "and    %8,%12;"
338
0
        "psrld  $0xa,%%xmm8;"
339
0
        "ror    $0xb,%11;"
340
0
        "xor    %5,%11;"
341
0
        "xor    %3,%12;"
342
0
        "ror    $0x6,%10;"
343
0
        "pxor   %%xmm3,%%xmm2;"
344
0
        "add    %10,%12;"
345
0
        "ror    $0x2,%11;"
346
0
        "add    8+%16,%12;"
347
0
        "pxor   %%xmm2,%%xmm8;"
348
0
        "mov    %5,%10;"
349
0
        "add    %12,%4;"
350
0
        "mov    %5,%12;"
351
0
        "pshufb %%xmm10,%%xmm8;"
352
0
        "or     %k2,%10;"
353
0
        "add    %4,%7;"
354
0
        "and    %k2,%12;"
355
0
        "paddd  %%xmm8,%%xmm0;"
356
0
        "and    %6,%10;"
357
0
        "add    %11,%4;"
358
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
359
0
        "or     %12,%10;"
360
0
        "add    %10,%4;"
361
0
        "movdqa %%xmm2,%%xmm3;"
362
0
        "mov    %7,%10;"
363
0
        "ror    $0xe,%10;"
364
0
        "mov    %4,%11;"
365
0
        "movdqa %%xmm2,%%xmm5;"
366
0
        "ror    $0x9,%11;"
367
0
        "xor    %7,%10;"
368
0
        "mov    %8,%12;"
369
0
        "ror    $0x5,%10;"
370
0
        "psrlq  $0x11,%%xmm2;"
371
0
        "xor    %4,%11;"
372
0
        "xor    %9,%12;"
373
0
        "psrlq  $0x13,%%xmm3;"
374
0
        "xor    %7,%10;"
375
0
        "and    %7,%12;"
376
0
        "ror    $0xb,%11;"
377
0
        "psrld  $0xa,%%xmm5;"
378
0
        "xor    %4,%11;"
379
0
        "ror    $0x6,%10;"
380
0
        "xor    %9,%12;"
381
0
        "pxor   %%xmm3,%%xmm2;"
382
0
        "ror    $0x2,%11;"
383
0
        "add    %10,%12;"
384
0
        "add    12+%16,%12;"
385
0
        "pxor   %%xmm2,%%xmm5;"
386
0
        "mov    %4,%10;"
387
0
        "add    %12,%3;"
388
0
        "mov    %4,%12;"
389
0
        "pshufb %%xmm11,%%xmm5;"
390
0
        "or     %6,%10;"
391
0
        "add    %3,%k2;"
392
0
        "and    %6,%12;"
393
0
        "paddd  %%xmm0,%%xmm5;"
394
0
        "and    %5,%10;"
395
0
        "add    %11,%3;"
396
0
        "or     %12,%10;"
397
0
        "add    %10,%3;"
398
0
        // Group 2: xfer = K[tbl+0x20] + xmm6; produces the new xmm6.
        "movdqa 0x20(%13),%%xmm9;"
399
0
        "paddd  %%xmm6,%%xmm9;"
400
0
        "movdqa %%xmm9,%16;"
401
0
        "movdqa %%xmm5,%%xmm0;"
402
0
        "mov    %k2,%10;"
403
0
        "ror    $0xe,%10;"
404
0
        "mov    %3,%11;"
405
0
        "palignr $0x4,%%xmm4,%%xmm0;"
406
0
        "ror    $0x9,%11;"
407
0
        "xor    %k2,%10;"
408
0
        "mov    %7,%12;"
409
0
        "ror    $0x5,%10;"
410
0
        "movdqa %%xmm7,%%xmm1;"
411
0
        "xor    %3,%11;"
412
0
        "xor    %8,%12;"
413
0
        "paddd  %%xmm6,%%xmm0;"
414
0
        "xor    %k2,%10;"
415
0
        "and    %k2,%12;"
416
0
        "ror    $0xb,%11;"
417
0
        "palignr $0x4,%%xmm6,%%xmm1;"
418
0
        "xor    %3,%11;"
419
0
        "ror    $0x6,%10;"
420
0
        "xor    %8,%12;"
421
0
        "movdqa %%xmm1,%%xmm2;"
422
0
        "ror    $0x2,%11;"
423
0
        "add    %10,%12;"
424
0
        "add    %16,%12;"
425
0
        "movdqa %%xmm1,%%xmm3;"
426
0
        "mov    %3,%10;"
427
0
        "add    %12,%9;"
428
0
        "mov    %3,%12;"
429
0
        "pslld  $0x19,%%xmm1;"
430
0
        "or     %5,%10;"
431
0
        "add    %9,%6;"
432
0
        "and    %5,%12;"
433
0
        "psrld  $0x7,%%xmm2;"
434
0
        "and    %4,%10;"
435
0
        "add    %11,%9;"
436
0
        "por    %%xmm2,%%xmm1;"
437
0
        "or     %12,%10;"
438
0
        "add    %10,%9;"
439
0
        "movdqa %%xmm3,%%xmm2;"
440
0
        "mov    %6,%10;"
441
0
        "mov    %9,%11;"
442
0
        "movdqa %%xmm3,%%xmm8;"
443
0
        "ror    $0xe,%10;"
444
0
        "xor    %6,%10;"
445
0
        "mov    %k2,%12;"
446
0
        "ror    $0x9,%11;"
447
0
        "pslld  $0xe,%%xmm3;"
448
0
        "xor    %9,%11;"
449
0
        "ror    $0x5,%10;"
450
0
        "xor    %7,%12;"
451
0
        "psrld  $0x12,%%xmm2;"
452
0
        "ror    $0xb,%11;"
453
0
        "xor    %6,%10;"
454
0
        "and    %6,%12;"
455
0
        "ror    $0x6,%10;"
456
0
        "pxor   %%xmm3,%%xmm1;"
457
0
        "xor    %9,%11;"
458
0
        "xor    %7,%12;"
459
0
        "psrld  $0x3,%%xmm8;"
460
0
        "add    %10,%12;"
461
0
        "add    4+%16,%12;"
462
0
        "ror    $0x2,%11;"
463
0
        "pxor   %%xmm2,%%xmm1;"
464
0
        "mov    %9,%10;"
465
0
        "add    %12,%8;"
466
0
        "mov    %9,%12;"
467
0
        "pxor   %%xmm8,%%xmm1;"
468
0
        "or     %4,%10;"
469
0
        "add    %8,%5;"
470
0
        "and    %4,%12;"
471
0
        "pshufd $0xfa,%%xmm5,%%xmm2;"
472
0
        "and    %3,%10;"
473
0
        "add    %11,%8;"
474
0
        "paddd  %%xmm1,%%xmm0;"
475
0
        "or     %12,%10;"
476
0
        "add    %10,%8;"
477
0
        "movdqa %%xmm2,%%xmm3;"
478
0
        "mov    %5,%10;"
479
0
        "mov    %8,%11;"
480
0
        "ror    $0xe,%10;"
481
0
        "movdqa %%xmm2,%%xmm8;"
482
0
        "xor    %5,%10;"
483
0
        "ror    $0x9,%11;"
484
0
        "mov    %6,%12;"
485
0
        "xor    %8,%11;"
486
0
        "ror    $0x5,%10;"
487
0
        "psrlq  $0x11,%%xmm2;"
488
0
        "xor    %k2,%12;"
489
0
        "psrlq  $0x13,%%xmm3;"
490
0
        "xor    %5,%10;"
491
0
        "and    %5,%12;"
492
0
        "psrld  $0xa,%%xmm8;"
493
0
        "ror    $0xb,%11;"
494
0
        "xor    %8,%11;"
495
0
        "xor    %k2,%12;"
496
0
        "ror    $0x6,%10;"
497
0
        "pxor   %%xmm3,%%xmm2;"
498
0
        "add    %10,%12;"
499
0
        "ror    $0x2,%11;"
500
0
        "add    8+%16,%12;"
501
0
        "pxor   %%xmm2,%%xmm8;"
502
0
        "mov    %8,%10;"
503
0
        "add    %12,%7;"
504
0
        "mov    %8,%12;"
505
0
        "pshufb %%xmm10,%%xmm8;"
506
0
        "or     %3,%10;"
507
0
        "add    %7,%4;"
508
0
        "and    %3,%12;"
509
0
        "paddd  %%xmm8,%%xmm0;"
510
0
        "and    %9,%10;"
511
0
        "add    %11,%7;"
512
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
513
0
        "or     %12,%10;"
514
0
        "add    %10,%7;"
515
0
        "movdqa %%xmm2,%%xmm3;"
516
0
        "mov    %4,%10;"
517
0
        "ror    $0xe,%10;"
518
0
        "mov    %7,%11;"
519
0
        "movdqa %%xmm2,%%xmm6;"
520
0
        "ror    $0x9,%11;"
521
0
        "xor    %4,%10;"
522
0
        "mov    %5,%12;"
523
0
        "ror    $0x5,%10;"
524
0
        "psrlq  $0x11,%%xmm2;"
525
0
        "xor    %7,%11;"
526
0
        "xor    %6,%12;"
527
0
        "psrlq  $0x13,%%xmm3;"
528
0
        "xor    %4,%10;"
529
0
        "and    %4,%12;"
530
0
        "ror    $0xb,%11;"
531
0
        "psrld  $0xa,%%xmm6;"
532
0
        "xor    %7,%11;"
533
0
        "ror    $0x6,%10;"
534
0
        "xor    %6,%12;"
535
0
        "pxor   %%xmm3,%%xmm2;"
536
0
        "ror    $0x2,%11;"
537
0
        "add    %10,%12;"
538
0
        "add    12+%16,%12;"
539
0
        "pxor   %%xmm2,%%xmm6;"
540
0
        "mov    %7,%10;"
541
0
        "add    %12,%k2;"
542
0
        "mov    %7,%12;"
543
0
        "pshufb %%xmm11,%%xmm6;"
544
0
        "or     %9,%10;"
545
0
        "add    %k2,%3;"
546
0
        "and    %9,%12;"
547
0
        "paddd  %%xmm0,%%xmm6;"
548
0
        "and    %8,%10;"
549
0
        "add    %11,%k2;"
550
0
        "or     %12,%10;"
551
0
        "add    %10,%k2;"
552
0
        // Group 3: xfer = K[tbl+0x30] + xmm7; produces the new xmm7.
        // tbl is advanced by 0x40 here, ready for the next pass.
        "movdqa 0x30(%13),%%xmm9;"
553
0
        "paddd  %%xmm7,%%xmm9;"
554
0
        "movdqa %%xmm9,%16;"
555
0
        "add    $0x40,%13;"
556
0
        "movdqa %%xmm6,%%xmm0;"
557
0
        "mov    %3,%10;"
558
0
        "ror    $0xe,%10;"
559
0
        "mov    %k2,%11;"
560
0
        "palignr $0x4,%%xmm5,%%xmm0;"
561
0
        "ror    $0x9,%11;"
562
0
        "xor    %3,%10;"
563
0
        "mov    %4,%12;"
564
0
        "ror    $0x5,%10;"
565
0
        "movdqa %%xmm4,%%xmm1;"
566
0
        "xor    %k2,%11;"
567
0
        "xor    %5,%12;"
568
0
        "paddd  %%xmm7,%%xmm0;"
569
0
        "xor    %3,%10;"
570
0
        "and    %3,%12;"
571
0
        "ror    $0xb,%11;"
572
0
        "palignr $0x4,%%xmm7,%%xmm1;"
573
0
        "xor    %k2,%11;"
574
0
        "ror    $0x6,%10;"
575
0
        "xor    %5,%12;"
576
0
        "movdqa %%xmm1,%%xmm2;"
577
0
        "ror    $0x2,%11;"
578
0
        "add    %10,%12;"
579
0
        "add    %16,%12;"
580
0
        "movdqa %%xmm1,%%xmm3;"
581
0
        "mov    %k2,%10;"
582
0
        "add    %12,%6;"
583
0
        "mov    %k2,%12;"
584
0
        "pslld  $0x19,%%xmm1;"
585
0
        "or     %8,%10;"
586
0
        "add    %6,%9;"
587
0
        "and    %8,%12;"
588
0
        "psrld  $0x7,%%xmm2;"
589
0
        "and    %7,%10;"
590
0
        "add    %11,%6;"
591
0
        "por    %%xmm2,%%xmm1;"
592
0
        "or     %12,%10;"
593
0
        "add    %10,%6;"
594
0
        "movdqa %%xmm3,%%xmm2;"
595
0
        "mov    %9,%10;"
596
0
        "mov    %6,%11;"
597
0
        "movdqa %%xmm3,%%xmm8;"
598
0
        "ror    $0xe,%10;"
599
0
        "xor    %9,%10;"
600
0
        "mov    %3,%12;"
601
0
        "ror    $0x9,%11;"
602
0
        "pslld  $0xe,%%xmm3;"
603
0
        "xor    %6,%11;"
604
0
        "ror    $0x5,%10;"
605
0
        "xor    %4,%12;"
606
0
        "psrld  $0x12,%%xmm2;"
607
0
        "ror    $0xb,%11;"
608
0
        "xor    %9,%10;"
609
0
        "and    %9,%12;"
610
0
        "ror    $0x6,%10;"
611
0
        "pxor   %%xmm3,%%xmm1;"
612
0
        "xor    %6,%11;"
613
0
        "xor    %4,%12;"
614
0
        "psrld  $0x3,%%xmm8;"
615
0
        "add    %10,%12;"
616
0
        "add    4+%16,%12;"
617
0
        "ror    $0x2,%11;"
618
0
        "pxor   %%xmm2,%%xmm1;"
619
0
        "mov    %6,%10;"
620
0
        "add    %12,%5;"
621
0
        "mov    %6,%12;"
622
0
        "pxor   %%xmm8,%%xmm1;"
623
0
        "or     %7,%10;"
624
0
        "add    %5,%8;"
625
0
        "and    %7,%12;"
626
0
        "pshufd $0xfa,%%xmm6,%%xmm2;"
627
0
        "and    %k2,%10;"
628
0
        "add    %11,%5;"
629
0
        "paddd  %%xmm1,%%xmm0;"
630
0
        "or     %12,%10;"
631
0
        "add    %10,%5;"
632
0
        "movdqa %%xmm2,%%xmm3;"
633
0
        "mov    %8,%10;"
634
0
        "mov    %5,%11;"
635
0
        "ror    $0xe,%10;"
636
0
        "movdqa %%xmm2,%%xmm8;"
637
0
        "xor    %8,%10;"
638
0
        "ror    $0x9,%11;"
639
0
        "mov    %9,%12;"
640
0
        "xor    %5,%11;"
641
0
        "ror    $0x5,%10;"
642
0
        "psrlq  $0x11,%%xmm2;"
643
0
        "xor    %3,%12;"
644
0
        "psrlq  $0x13,%%xmm3;"
645
0
        "xor    %8,%10;"
646
0
        "and    %8,%12;"
647
0
        "psrld  $0xa,%%xmm8;"
648
0
        "ror    $0xb,%11;"
649
0
        "xor    %5,%11;"
650
0
        "xor    %3,%12;"
651
0
        "ror    $0x6,%10;"
652
0
        "pxor   %%xmm3,%%xmm2;"
653
0
        "add    %10,%12;"
654
0
        "ror    $0x2,%11;"
655
0
        "add    8+%16,%12;"
656
0
        "pxor   %%xmm2,%%xmm8;"
657
0
        "mov    %5,%10;"
658
0
        "add    %12,%4;"
659
0
        "mov    %5,%12;"
660
0
        "pshufb %%xmm10,%%xmm8;"
661
0
        "or     %k2,%10;"
662
0
        "add    %4,%7;"
663
0
        "and    %k2,%12;"
664
0
        "paddd  %%xmm8,%%xmm0;"
665
0
        "and    %6,%10;"
666
0
        "add    %11,%4;"
667
0
        "pshufd $0x50,%%xmm0,%%xmm2;"
668
0
        "or     %12,%10;"
669
0
        "add    %10,%4;"
670
0
        "movdqa %%xmm2,%%xmm3;"
671
0
        "mov    %7,%10;"
672
0
        "ror    $0xe,%10;"
673
0
        "mov    %4,%11;"
674
0
        "movdqa %%xmm2,%%xmm7;"
675
0
        "ror    $0x9,%11;"
676
0
        "xor    %7,%10;"
677
0
        "mov    %8,%12;"
678
0
        "ror    $0x5,%10;"
679
0
        "psrlq  $0x11,%%xmm2;"
680
0
        "xor    %4,%11;"
681
0
        "xor    %9,%12;"
682
0
        "psrlq  $0x13,%%xmm3;"
683
0
        "xor    %7,%10;"
684
0
        "and    %7,%12;"
685
0
        "ror    $0xb,%11;"
686
0
        "psrld  $0xa,%%xmm7;"
687
0
        "xor    %4,%11;"
688
0
        "ror    $0x6,%10;"
689
0
        "xor    %9,%12;"
690
0
        "pxor   %%xmm3,%%xmm2;"
691
0
        "ror    $0x2,%11;"
692
0
        "add    %10,%12;"
693
0
        "add    12+%16,%12;"
694
0
        "pxor   %%xmm2,%%xmm7;"
695
0
        "mov    %4,%10;"
696
0
        "add    %12,%3;"
697
0
        "mov    %4,%12;"
698
0
        "pshufb %%xmm11,%%xmm7;"
699
0
        "or     %6,%10;"
700
0
        "add    %3,%k2;"
701
0
        "and    %6,%12;"
702
0
        "paddd  %%xmm0,%%xmm7;"
703
0
        "and    %5,%10;"
704
0
        "add    %11,%3;"
705
0
        "or     %12,%10;"
706
0
        "add    %10,%3;"
707
0
        // End of a 16-round pass: repeat until the counter in %1 reaches 0,
        // then reload it with 2 for the second loop.
        "sub    $0x1,%1;"
708
0
        "jne    Lloop1_%=;"
709
0
        "mov    $0x2,%1;"
710
711
0
        // ---- Loop 2: 2 passes x 8 rounds, no message scheduling ----
        // Only scalar round code remains. The first four rounds of a pass use
        // xmm4 + K[tbl], the next four use xmm5 + K[tbl+0x10].
        "Lloop2_%=:"
712
0
        "paddd  0x0(%13),%%xmm4;"
713
0
        "movdqa %%xmm4,%16;"
714
0
        "mov    %k2,%10;"
715
0
        "ror    $0xe,%10;"
716
0
        "mov    %3,%11;"
717
0
        "xor    %k2,%10;"
718
0
        "ror    $0x9,%11;"
719
0
        "mov    %7,%12;"
720
0
        "xor    %3,%11;"
721
0
        "ror    $0x5,%10;"
722
0
        "xor    %8,%12;"
723
0
        "xor    %k2,%10;"
724
0
        "ror    $0xb,%11;"
725
0
        "and    %k2,%12;"
726
0
        "xor    %3,%11;"
727
0
        "ror    $0x6,%10;"
728
0
        "xor    %8,%12;"
729
0
        "add    %10,%12;"
730
0
        "ror    $0x2,%11;"
731
0
        "add    %16,%12;"
732
0
        "mov    %3,%10;"
733
0
        "add    %12,%9;"
734
0
        "mov    %3,%12;"
735
0
        "or     %5,%10;"
736
0
        "add    %9,%6;"
737
0
        "and    %5,%12;"
738
0
        "and    %4,%10;"
739
0
        "add    %11,%9;"
740
0
        "or     %12,%10;"
741
0
        "add    %10,%9;"
742
0
        "mov    %6,%10;"
743
0
        "ror    $0xe,%10;"
744
0
        "mov    %9,%11;"
745
0
        "xor    %6,%10;"
746
0
        "ror    $0x9,%11;"
747
0
        "mov    %k2,%12;"
748
0
        "xor    %9,%11;"
749
0
        "ror    $0x5,%10;"
750
0
        "xor    %7,%12;"
751
0
        "xor    %6,%10;"
752
0
        "ror    $0xb,%11;"
753
0
        "and    %6,%12;"
754
0
        "xor    %9,%11;"
755
0
        "ror    $0x6,%10;"
756
0
        "xor    %7,%12;"
757
0
        "add    %10,%12;"
758
0
        "ror    $0x2,%11;"
759
0
        "add    4+%16,%12;"
760
0
        "mov    %9,%10;"
761
0
        "add    %12,%8;"
762
0
        "mov    %9,%12;"
763
0
        "or     %4,%10;"
764
0
        "add    %8,%5;"
765
0
        "and    %4,%12;"
766
0
        "and    %3,%10;"
767
0
        "add    %11,%8;"
768
0
        "or     %12,%10;"
769
0
        "add    %10,%8;"
770
0
        "mov    %5,%10;"
771
0
        "ror    $0xe,%10;"
772
0
        "mov    %8,%11;"
773
0
        "xor    %5,%10;"
774
0
        "ror    $0x9,%11;"
775
0
        "mov    %6,%12;"
776
0
        "xor    %8,%11;"
777
0
        "ror    $0x5,%10;"
778
0
        "xor    %k2,%12;"
779
0
        "xor    %5,%10;"
780
0
        "ror    $0xb,%11;"
781
0
        "and    %5,%12;"
782
0
        "xor    %8,%11;"
783
0
        "ror    $0x6,%10;"
784
0
        "xor    %k2,%12;"
785
0
        "add    %10,%12;"
786
0
        "ror    $0x2,%11;"
787
0
        "add    8+%16,%12;"
788
0
        "mov    %8,%10;"
789
0
        "add    %12,%7;"
790
0
        "mov    %8,%12;"
791
0
        "or     %3,%10;"
792
0
        "add    %7,%4;"
793
0
        "and    %3,%12;"
794
0
        "and    %9,%10;"
795
0
        "add    %11,%7;"
796
0
        "or     %12,%10;"
797
0
        "add    %10,%7;"
798
0
        "mov    %4,%10;"
799
0
        "ror    $0xe,%10;"
800
0
        "mov    %7,%11;"
801
0
        "xor    %4,%10;"
802
0
        "ror    $0x9,%11;"
803
0
        "mov    %5,%12;"
804
0
        "xor    %7,%11;"
805
0
        "ror    $0x5,%10;"
806
0
        "xor    %6,%12;"
807
0
        "xor    %4,%10;"
808
0
        "ror    $0xb,%11;"
809
0
        "and    %4,%12;"
810
0
        "xor    %7,%11;"
811
0
        "ror    $0x6,%10;"
812
0
        "xor    %6,%12;"
813
0
        "add    %10,%12;"
814
0
        "ror    $0x2,%11;"
815
0
        "add    12+%16,%12;"
816
0
        "mov    %7,%10;"
817
0
        "add    %12,%k2;"
818
0
        "mov    %7,%12;"
819
0
        "or     %9,%10;"
820
0
        "add    %k2,%3;"
821
0
        "and    %9,%12;"
822
0
        "and    %8,%10;"
823
0
        "add    %11,%k2;"
824
0
        "or     %12,%10;"
825
0
        "add    %10,%k2;"
826
0
        // Second half of the pass: xfer = xmm5 + K[tbl+0x10]; tbl += 0x20.
        "paddd  0x10(%13),%%xmm5;"
827
0
        "movdqa %%xmm5,%16;"
828
0
        "add    $0x20,%13;"
829
0
        "mov    %3,%10;"
830
0
        "ror    $0xe,%10;"
831
0
        "mov    %k2,%11;"
832
0
        "xor    %3,%10;"
833
0
        "ror    $0x9,%11;"
834
0
        "mov    %4,%12;"
835
0
        "xor    %k2,%11;"
836
0
        "ror    $0x5,%10;"
837
0
        "xor    %5,%12;"
838
0
        "xor    %3,%10;"
839
0
        "ror    $0xb,%11;"
840
0
        "and    %3,%12;"
841
0
        "xor    %k2,%11;"
842
0
        "ror    $0x6,%10;"
843
0
        "xor    %5,%12;"
844
0
        "add    %10,%12;"
845
0
        "ror    $0x2,%11;"
846
0
        "add    %16,%12;"
847
0
        "mov    %k2,%10;"
848
0
        "add    %12,%6;"
849
0
        "mov    %k2,%12;"
850
0
        "or     %8,%10;"
851
0
        "add    %6,%9;"
852
0
        "and    %8,%12;"
853
0
        "and    %7,%10;"
854
0
        "add    %11,%6;"
855
0
        "or     %12,%10;"
856
0
        "add    %10,%6;"
857
0
        "mov    %9,%10;"
858
0
        "ror    $0xe,%10;"
859
0
        "mov    %6,%11;"
860
0
        "xor    %9,%10;"
861
0
        "ror    $0x9,%11;"
862
0
        "mov    %3,%12;"
863
0
        "xor    %6,%11;"
864
0
        "ror    $0x5,%10;"
865
0
        "xor    %4,%12;"
866
0
        "xor    %9,%10;"
867
0
        "ror    $0xb,%11;"
868
0
        "and    %9,%12;"
869
0
        "xor    %6,%11;"
870
0
        "ror    $0x6,%10;"
871
0
        "xor    %4,%12;"
872
0
        "add    %10,%12;"
873
0
        "ror    $0x2,%11;"
874
0
        "add    4+%16,%12;"
875
0
        "mov    %6,%10;"
876
0
        "add    %12,%5;"
877
0
        "mov    %6,%12;"
878
0
        "or     %7,%10;"
879
0
        "add    %5,%8;"
880
0
        "and    %7,%12;"
881
0
        "and    %k2,%10;"
882
0
        "add    %11,%5;"
883
0
        "or     %12,%10;"
884
0
        "add    %10,%5;"
885
0
        "mov    %8,%10;"
886
0
        "ror    $0xe,%10;"
887
0
        "mov    %5,%11;"
888
0
        "xor    %8,%10;"
889
0
        "ror    $0x9,%11;"
890
0
        "mov    %9,%12;"
891
0
        "xor    %5,%11;"
892
0
        "ror    $0x5,%10;"
893
0
        "xor    %3,%12;"
894
0
        "xor    %8,%10;"
895
0
        "ror    $0xb,%11;"
896
0
        "and    %8,%12;"
897
0
        "xor    %5,%11;"
898
0
        "ror    $0x6,%10;"
899
0
        "xor    %3,%12;"
900
0
        "add    %10,%12;"
901
0
        "ror    $0x2,%11;"
902
0
        "add    8+%16,%12;"
903
0
        "mov    %5,%10;"
904
0
        "add    %12,%4;"
905
0
        "mov    %5,%12;"
906
0
        "or     %k2,%10;"
907
0
        "add    %4,%7;"
908
0
        "and    %k2,%12;"
909
0
        "and    %6,%10;"
910
0
        "add    %11,%4;"
911
0
        "or     %12,%10;"
912
0
        "add    %10,%4;"
913
0
        "mov    %7,%10;"
914
0
        "ror    $0xe,%10;"
915
0
        "mov    %4,%11;"
916
0
        "xor    %7,%10;"
917
0
        "ror    $0x9,%11;"
918
0
        "mov    %8,%12;"
919
0
        "xor    %4,%11;"
920
0
        "ror    $0x5,%10;"
921
0
        "xor    %9,%12;"
922
0
        "xor    %7,%10;"
923
0
        "ror    $0xb,%11;"
924
0
        "and    %7,%12;"
925
0
        "xor    %4,%11;"
926
0
        "ror    $0x6,%10;"
927
0
        "xor    %9,%12;"
928
0
        "add    %10,%12;"
929
0
        "ror    $0x2,%11;"
930
0
        "add    12+%16,%12;"
931
0
        "mov    %4,%10;"
932
0
        "add    %12,%3;"
933
0
        "mov    %4,%12;"
934
0
        "or     %6,%10;"
935
0
        "add    %3,%k2;"
936
0
        "and    %6,%12;"
937
0
        "and    %5,%10;"
938
0
        "add    %11,%3;"
939
0
        "or     %12,%10;"
940
0
        "add    %10,%3;"
941
0
        // Shift the remaining schedule words down (xmm6 -> xmm4,
        // xmm7 -> xmm5) so the second pass consumes them.
        "movdqa %%xmm6,%%xmm4;"
942
0
        "movdqa %%xmm7,%%xmm5;"
943
0
        "sub    $0x1,%1;"
944
0
        "jne    Lloop2_%=;"
945
0
        // All 64 rounds done: add the previous state into the working
        // variables and write the sums back to s[0..7].
        "add    (%0),%3;"
946
0
        "mov    %3,(%0);"
947
0
        "add    0x4(%0),%4;"
948
0
        "mov    %4,0x4(%0);"
949
0
        "add    0x8(%0),%5;"
950
0
        "mov    %5,0x8(%0);"
951
0
        "add    0xc(%0),%6;"
952
0
        "mov    %6,0xc(%0);"
953
0
        "add    0x10(%0),%k2;"
954
0
        "mov    %k2,0x10(%0);"
955
0
        "add    0x14(%0),%7;"
956
0
        "mov    %7,0x14(%0);"
957
0
        "add    0x18(%0),%8;"
958
0
        "mov    %8,0x18(%0);"
959
0
        "add    0x1c(%0),%9;"
960
0
        "mov    %9,0x1c(%0);"
961
0
        // Restore the input pointer, step to the next 64-byte block and loop
        // until it equals the end address saved in inp_end.
        "mov    %15,%1;"
962
0
        "add    $0x40,%1;"
963
0
        "cmp    %14,%1;"
964
0
        "jne    Lloop0_%=;"
965
966
0
        // Exit label; also the target of the early exit when blocks == 0.
        "Ldone_hash_%=:"
967
968
0
        // Outputs %0..%16. s, chunk and blocks are read-write registers (the
        // asm overwrites all three); inp_end, inp and xfer are memory slots.
        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* no operand for e: it lives in %k2, the 32-bit view of the blocks register */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
969
0
        // Inputs %17..%20: the constant tables, referenced as memory operands.
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
970
0
        // Clobbers: flags, memory (the state behind s is read and written),
        // and every xmm register the code touches (xmm0..xmm12).
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
971
0
   );
972
0
}
973
}
974
975
/*
976
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
977
; Copyright (c) 2012, Intel Corporation
978
;
979
; All rights reserved.
980
;
981
; Redistribution and use in source and binary forms, with or without
982
; modification, are permitted provided that the following conditions are
983
; met:
984
;
985
; * Redistributions of source code must retain the above copyright
986
;   notice, this list of conditions and the following disclaimer.
987
;
988
; * Redistributions in binary form must reproduce the above copyright
989
;   notice, this list of conditions and the following disclaimer in the
990
;   documentation and/or other materials provided with the
991
;   distribution.
992
;
993
; * Neither the name of the Intel Corporation nor the names of its
994
;   contributors may be used to endorse or promote products derived from
995
;   this software without specific prior written permission.
996
;
997
;
998
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
999
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1000
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
1001
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
1002
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
1003
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
1004
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1005
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1006
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1007
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1008
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1009
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1010
;
1011
; Example YASM command lines:
1012
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1013
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1014
;
1015
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1016
;
1017
; This code is described in an Intel White-Paper:
1018
; "Fast SHA-256 Implementations on Intel Architecture Processors"
1019
;
1020
; To find it, surf to https://www.intel.com/p/en_US/embedded
1021
; and search for that title.
1022
; The paper is expected to be released roughly at the end of April, 2012
1023
;
1024
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1025
; This code schedules 1 block at a time, with 4 lanes per block
1026
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1027
1028
%define MOVDQ movdqu ;; assume buffers not aligned
1029
1030
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1031
1032
; addm [mem], reg
1033
; Add reg to mem using reg-mem add and store
1034
%macro addm 2
1035
    add %2, %1
1036
    mov %1, %2
1037
%endm
1038
1039
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1040
1041
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1042
; Load xmm with mem and byte swap each dword
1043
%macro COPY_XMM_AND_BSWAP 3
1044
    MOVDQ %1, %2
1045
    pshufb %1, %3
1046
%endmacro
1047
1048
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1049
1050
%define X0 xmm4
1051
%define X1 xmm5
1052
%define X2 xmm6
1053
%define X3 xmm7
1054
1055
%define XTMP0 xmm0
1056
%define XTMP1 xmm1
1057
%define XTMP2 xmm2
1058
%define XTMP3 xmm3
1059
%define XTMP4 xmm8
1060
%define XFER  xmm9
1061
1062
%define SHUF_00BA   xmm10 ; shuffle xBxA -> 00BA
1063
%define SHUF_DC00   xmm11 ; shuffle xDxC -> DC00
1064
%define BYTE_FLIP_MASK  xmm12
1065
1066
%ifdef LINUX
1067
%define NUM_BLKS rdx    ; 3rd arg
1068
%define CTX rsi ; 2nd arg
1069
%define INP rdi ; 1st arg
1070
1071
%define SRND    rdi ; clobbers INP
1072
%define c   ecx
1073
%define d   r8d
1074
%define e   edx
1075
%else
1076
%define NUM_BLKS r8 ; 3rd arg
1077
%define CTX rdx     ; 2nd arg
1078
%define INP rcx     ; 1st arg
1079
1080
%define SRND    rcx ; clobbers INP
1081
%define c   edi
1082
%define d   esi
1083
%define e   r8d
1084
1085
%endif
1086
%define TBL rbp
1087
%define a eax
1088
%define b ebx
1089
1090
%define f r9d
1091
%define g r10d
1092
%define h r11d
1093
1094
%define y0 r13d
1095
%define y1 r14d
1096
%define y2 r15d
1097
1098
1099
1100
_INP_END_SIZE   equ 8
1101
_INP_SIZE   equ 8
1102
_XFER_SIZE  equ 8
1103
%ifdef LINUX
1104
_XMM_SAVE_SIZE  equ 0
1105
%else
1106
_XMM_SAVE_SIZE  equ 7*16
1107
%endif
1108
; STACK_SIZE plus pushes must be an odd multiple of 8
1109
_ALIGN_SIZE equ 8
1110
1111
_INP_END    equ 0
1112
_INP        equ _INP_END  + _INP_END_SIZE
1113
_XFER       equ _INP      + _INP_SIZE
1114
_XMM_SAVE   equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
1115
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
1116
1117
; rotate_Xs
1118
; Rotate values of symbols X0...X3
1119
%macro rotate_Xs 0
1120
%xdefine X_ X0
1121
%xdefine X0 X1
1122
%xdefine X1 X2
1123
%xdefine X2 X3
1124
%xdefine X3 X_
1125
%endm
1126
1127
; ROTATE_ARGS
1128
; Rotate values of symbols a...h
1129
%macro ROTATE_ARGS 0
1130
%xdefine TMP_ h
1131
%xdefine h g
1132
%xdefine g f
1133
%xdefine f e
1134
%xdefine e d
1135
%xdefine d c
1136
%xdefine c b
1137
%xdefine b a
1138
%xdefine a TMP_
1139
%endm
1140
1141
%macro FOUR_ROUNDS_AND_SCHED 0
1142
    ;; compute s0 four at a time and s1 two at a time
1143
    ;; compute W[-16] + W[-7] 4 at a time
1144
    movdqa  XTMP0, X3
1145
    mov y0, e       ; y0 = e
1146
    ror y0, (25-11) ; y0 = e >> (25-11)
1147
    mov y1, a       ; y1 = a
1148
    palignr XTMP0, X2, 4    ; XTMP0 = W[-7]
1149
    ror y1, (22-13) ; y1 = a >> (22-13)
1150
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1151
    mov y2, f       ; y2 = f
1152
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1153
    movdqa  XTMP1, X1
1154
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1155
    xor y2, g       ; y2 = f^g
1156
    paddd   XTMP0, X0   ; XTMP0 = W[-7] + W[-16]
1157
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1158
    and y2, e       ; y2 = (f^g)&e
1159
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1160
    ;; compute s0
1161
    palignr XTMP1, X0, 4    ; XTMP1 = W[-15]
1162
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1163
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1164
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1165
    movdqa  XTMP2, XTMP1    ; XTMP2 = W[-15]
1166
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1167
    add y2, y0      ; y2 = S1 + CH
1168
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1169
    movdqa  XTMP3, XTMP1    ; XTMP3 = W[-15]
1170
    mov y0, a       ; y0 = a
1171
    add h, y2       ; h = h + S1 + CH + k + w
1172
    mov y2, a       ; y2 = a
1173
    pslld   XTMP1, (32-7)
1174
    or  y0, c       ; y0 = a|c
1175
    add d, h        ; d = d + h + S1 + CH + k + w
1176
    and y2, c       ; y2 = a&c
1177
    psrld   XTMP2, 7
1178
    and y0, b       ; y0 = (a|c)&b
1179
    add h, y1       ; h = h + S1 + CH + k + w + S0
1180
    por XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7
1181
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1182
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1183
1184
ROTATE_ARGS
1185
    movdqa  XTMP2, XTMP3    ; XTMP2 = W[-15]
1186
    mov y0, e       ; y0 = e
1187
    mov y1, a       ; y1 = a
1188
    movdqa  XTMP4, XTMP3    ; XTMP4 = W[-15]
1189
    ror y0, (25-11) ; y0 = e >> (25-11)
1190
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1191
    mov y2, f       ; y2 = f
1192
    ror y1, (22-13) ; y1 = a >> (22-13)
1193
    pslld   XTMP3, (32-18)
1194
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1195
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1196
    xor y2, g       ; y2 = f^g
1197
    psrld   XTMP2, 18
1198
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1199
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1200
    and y2, e       ; y2 = (f^g)&e
1201
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1202
    pxor    XTMP1, XTMP3
1203
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1204
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1205
    psrld   XTMP4, 3    ; XTMP4 = W[-15] >> 3
1206
    add y2, y0      ; y2 = S1 + CH
1207
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1208
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1209
    pxor    XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1210
    mov y0, a       ; y0 = a
1211
    add h, y2       ; h = h + S1 + CH + k + w
1212
    mov y2, a       ; y2 = a
1213
    pxor    XTMP1, XTMP4    ; XTMP1 = s0
1214
    or  y0, c       ; y0 = a|c
1215
    add d, h        ; d = d + h + S1 + CH + k + w
1216
    and y2, c       ; y2 = a&c
1217
    ;; compute low s1
1218
    pshufd  XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
1219
    and y0, b       ; y0 = (a|c)&b
1220
    add h, y1       ; h = h + S1 + CH + k + w + S0
1221
    paddd   XTMP0, XTMP1    ; XTMP0 = W[-16] + W[-7] + s0
1222
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1223
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1224
1225
ROTATE_ARGS
1226
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}
1227
    mov y0, e       ; y0 = e
1228
    mov y1, a       ; y1 = a
1229
    ror y0, (25-11) ; y0 = e >> (25-11)
1230
    movdqa  XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}
1231
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1232
    ror y1, (22-13) ; y1 = a >> (22-13)
1233
    mov y2, f       ; y2 = f
1234
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1235
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1236
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xBxA}
1237
    xor y2, g       ; y2 = f^g
1238
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xBxA}
1239
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1240
    and y2, e       ; y2 = (f^g)&e
1241
    psrld   XTMP4, 10   ; XTMP4 = W[-2] >> 10 {BBAA}
1242
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1243
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1244
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1245
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1246
    pxor    XTMP2, XTMP3
1247
    add y2, y0      ; y2 = S1 + CH
1248
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1249
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1250
    pxor    XTMP4, XTMP2    ; XTMP4 = s1 {xBxA}
1251
    mov y0, a       ; y0 = a
1252
    add h, y2       ; h = h + S1 + CH + k + w
1253
    mov y2, a       ; y2 = a
1254
    pshufb  XTMP4, SHUF_00BA    ; XTMP4 = s1 {00BA}
1255
    or  y0, c       ; y0 = a|c
1256
    add d, h        ; d = d + h + S1 + CH + k + w
1257
    and y2, c       ; y2 = a&c
1258
    paddd   XTMP0, XTMP4    ; XTMP0 = {..., ..., W[1], W[0]}
1259
    and y0, b       ; y0 = (a|c)&b
1260
    add h, y1       ; h = h + S1 + CH + k + w + S0
1261
    ;; compute high s1
1262
    pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1263
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1264
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1265
1266
ROTATE_ARGS
1267
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
1268
    mov y0, e       ; y0 = e
1269
    ror y0, (25-11) ; y0 = e >> (25-11)
1270
    mov y1, a       ; y1 = a
1271
    movdqa  X0,    XTMP2    ; X0    = W[-2] {DDCC}
1272
    ror y1, (22-13) ; y1 = a >> (22-13)
1273
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1274
    mov y2, f       ; y2 = f
1275
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1276
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xDxC}
1277
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1278
    xor y2, g       ; y2 = f^g
1279
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xDxC}
1280
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1281
    and y2, e       ; y2 = (f^g)&e
1282
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1283
    psrld   X0,    10   ; X0 = W[-2] >> 10 {DDCC}
1284
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1285
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1286
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1287
    pxor    XTMP2, XTMP3
1288
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1289
    add y2, y0      ; y2 = S1 + CH
1290
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1291
    pxor    X0, XTMP2   ; X0 = s1 {xDxC}
1292
    mov y0, a       ; y0 = a
1293
    add h, y2       ; h = h + S1 + CH + k + w
1294
    mov y2, a       ; y2 = a
1295
    pshufb  X0, SHUF_DC00   ; X0 = s1 {DC00}
1296
    or  y0, c       ; y0 = a|c
1297
    add d, h        ; d = d + h + S1 + CH + k + w
1298
    and y2, c       ; y2 = a&c
1299
    paddd   X0, XTMP0   ; X0 = {W[3], W[2], W[1], W[0]}
1300
    and y0, b       ; y0 = (a|c)&b
1301
    add h, y1       ; h = h + S1 + CH + k + w + S0
1302
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1303
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1304
1305
ROTATE_ARGS
1306
rotate_Xs
1307
%endm
1308
1309
;; input is [rsp + _XFER + %1 * 4]
1310
%macro DO_ROUND 1
1311
    mov y0, e       ; y0 = e
1312
    ror y0, (25-11) ; y0 = e >> (25-11)
1313
    mov y1, a       ; y1 = a
1314
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1315
    ror y1, (22-13) ; y1 = a >> (22-13)
1316
    mov y2, f       ; y2 = f
1317
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1318
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1319
    xor y2, g       ; y2 = f^g
1320
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1321
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1322
    and y2, e       ; y2 = (f^g)&e
1323
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1324
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1325
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1326
    add y2, y0      ; y2 = S1 + CH
1327
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1328
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
1329
    mov y0, a       ; y0 = a
1330
    add h, y2       ; h = h + S1 + CH + k + w
1331
    mov y2, a       ; y2 = a
1332
    or  y0, c       ; y0 = a|c
1333
    add d, h        ; d = d + h + S1 + CH + k + w
1334
    and y2, c       ; y2 = a&c
1335
    and y0, b       ; y0 = (a|c)&b
1336
    add h, y1       ; h = h + S1 + CH + k + w + S0
1337
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1338
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1339
    ROTATE_ARGS
1340
%endm
1341
1342
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1343
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1344
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1345
;; arg 1 : pointer to input data
1346
;; arg 2 : pointer to digest
1347
;; arg 3 : Num blocks
1348
section .text
1349
global sha256_sse4
1350
align 32
1351
sha256_sse4:
1352
    push    rbx
1353
%ifndef LINUX
1354
    push    rsi
1355
    push    rdi
1356
%endif
1357
    push    rbp
1358
    push    r13
1359
    push    r14
1360
    push    r15
1361
1362
    sub rsp,STACK_SIZE
1363
%ifndef LINUX
1364
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6
1365
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
1366
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8
1367
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9
1368
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
1369
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
1370
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
1371
%endif
1372
1373
    shl NUM_BLKS, 6 ; convert to bytes
1374
    jz  done_hash
1375
    add NUM_BLKS, INP   ; pointer to end of data
1376
    mov [rsp + _INP_END], NUM_BLKS
1377
1378
    ;; load initial digest
1379
    mov a,[4*0 + CTX]
1380
    mov b,[4*1 + CTX]
1381
    mov c,[4*2 + CTX]
1382
    mov d,[4*3 + CTX]
1383
    mov e,[4*4 + CTX]
1384
    mov f,[4*5 + CTX]
1385
    mov g,[4*6 + CTX]
1386
    mov h,[4*7 + CTX]
1387
1388
    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1389
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
1390
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]
1391
1392
loop0:
1393
    lea TBL,[K256 wrt rip]
1394
1395
    ;; byte swap first 16 dwords
1396
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
1397
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
1398
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
1399
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK
1400
1401
    mov [rsp + _INP], INP
1402
1403
    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1404
    mov SRND, 3
1405
align 16
1406
loop1:
1407
    movdqa  XFER, [TBL + 0*16]
1408
    paddd   XFER, X0
1409
    movdqa  [rsp + _XFER], XFER
1410
    FOUR_ROUNDS_AND_SCHED
1411
1412
    movdqa  XFER, [TBL + 1*16]
1413
    paddd   XFER, X0
1414
    movdqa  [rsp + _XFER], XFER
1415
    FOUR_ROUNDS_AND_SCHED
1416
1417
    movdqa  XFER, [TBL + 2*16]
1418
    paddd   XFER, X0
1419
    movdqa  [rsp + _XFER], XFER
1420
    FOUR_ROUNDS_AND_SCHED
1421
1422
    movdqa  XFER, [TBL + 3*16]
1423
    paddd   XFER, X0
1424
    movdqa  [rsp + _XFER], XFER
1425
    add TBL, 4*16
1426
    FOUR_ROUNDS_AND_SCHED
1427
1428
    sub SRND, 1
1429
    jne loop1
1430
1431
    mov SRND, 2
1432
loop2:
1433
    paddd   X0, [TBL + 0*16]
1434
    movdqa  [rsp + _XFER], X0
1435
    DO_ROUND    0
1436
    DO_ROUND    1
1437
    DO_ROUND    2
1438
    DO_ROUND    3
1439
    paddd   X1, [TBL + 1*16]
1440
    movdqa  [rsp + _XFER], X1
1441
    add TBL, 2*16
1442
    DO_ROUND    0
1443
    DO_ROUND    1
1444
    DO_ROUND    2
1445
    DO_ROUND    3
1446
1447
    movdqa  X0, X2
1448
    movdqa  X1, X3
1449
1450
    sub SRND, 1
1451
    jne loop2
1452
1453
    addm    [4*0 + CTX],a
1454
    addm    [4*1 + CTX],b
1455
    addm    [4*2 + CTX],c
1456
    addm    [4*3 + CTX],d
1457
    addm    [4*4 + CTX],e
1458
    addm    [4*5 + CTX],f
1459
    addm    [4*6 + CTX],g
1460
    addm    [4*7 + CTX],h
1461
1462
    mov INP, [rsp + _INP]
1463
    add INP, 64
1464
    cmp INP, [rsp + _INP_END]
1465
    jne loop0
1466
1467
done_hash:
1468
%ifndef LINUX
1469
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
1470
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
1471
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
1472
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
1473
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
1474
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
1475
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
1476
%endif
1477
1478
    add rsp, STACK_SIZE
1479
1480
    pop r15
1481
    pop r14
1482
    pop r13
1483
    pop rbp
1484
%ifndef LINUX
1485
    pop rdi
1486
    pop rsi
1487
%endif
1488
    pop rbx
1489
1490
    ret
1491
1492
1493
section .data
1494
align 64
1495
K256:
1496
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1497
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1498
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1499
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1500
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1501
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1502
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1503
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1504
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1505
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1506
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1507
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1508
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1509
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1510
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1511
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1512
1513
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1514
1515
; shuffle xBxA -> 00BA
1516
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1517
1518
; shuffle xDxC -> DC00
1519
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1520
*/
1521
1522
#endif