Annotation of embedaddon/strongswan/src/libstrongswan/plugins/curve25519/ref10/ref10.c, revision 1.1.1.1

1.1       misho       1: /*
                      2:  * Copyright (C) 2016 Andreas Steffen
                      3:  * HSR Hochschule fuer Technik Rapperswil
                      4:  *
                      5:  * Based on the public domain libsodium adaptation by Frank Denis
                      6:  * of the SUPERCOP ref10 implementation by  Daniel J. Bernstein,
                      7:  * Niels Duif, Peter Schwabe, Tanja Lange and Bo-Yin Yang.
                      8:  */
                      9: 
                     10: #include <stddef.h>
                     11: #include <stdint.h>
                     12: #include <string.h>
                     13: 
                     14: #include "ref10.h"
                     15: 
                     16: #include <utils/utils.h>
                     17: 
                     18: static uint64_t load_3(const uint8_t *in)
                     19: {
                     20:        uint64_t result;
                     21: 
                     22:        result = (uint64_t) in[0];
                     23:        result |= ((uint64_t) in[1]) << 8;
                     24:        result |= ((uint64_t) in[2]) << 16;
                     25: 
                     26:        return result;
                     27: }
                     28: 
                     29: static uint64_t load_4(const uint8_t *in)
                     30: {
                     31:        uint64_t result;
                     32: 
                     33:        result = (uint64_t) in[0];
                     34:        result |= ((uint64_t) in[1]) << 8;
                     35:        result |= ((uint64_t) in[2]) << 16;
                     36:        result |= ((uint64_t) in[3]) << 24;
                     37: 
                     38:        return result;
                     39: }
                     40: 
                     41: /**
                     42:  * h = 0
                     43:  */
                     44: static void fe_0(fe h)
                     45: {
                     46:        memset(&h[0], 0, 10 * sizeof h[0]);
                     47: }
                     48: 
                     49: /**
                     50:  * h = 1
                     51:  */
                     52: static void fe_1(fe h)
                     53: {
                     54:        h[0] = 1;
                     55:        h[1] = 0;
                     56:        memset(&h[2], 0, 8 * sizeof h[0]);
                     57: }
                     58: 
                     59: /**
                     60:  * h = f + g
                     61:  * Can overlap h with f or g.
                     62:  *
                     63:  * Preconditions:
                     64:  * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                     65:  * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                     66:  *
                     67:  * Postconditions:
                     68:  * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
                     69:  */
                     70: static void fe_add(fe h, const fe f, const fe g)
                     71: {
                     72:        int32_t f0 = f[0];
                     73:        int32_t f1 = f[1];
                     74:        int32_t f2 = f[2];
                     75:        int32_t f3 = f[3];
                     76:        int32_t f4 = f[4];
                     77:        int32_t f5 = f[5];
                     78:        int32_t f6 = f[6];
                     79:        int32_t f7 = f[7];
                     80:        int32_t f8 = f[8];
                     81:        int32_t f9 = f[9];
                     82: 
                     83:        int32_t g0 = g[0];
                     84:        int32_t g1 = g[1];
                     85:        int32_t g2 = g[2];
                     86:        int32_t g3 = g[3];
                     87:        int32_t g4 = g[4];
                     88:        int32_t g5 = g[5];
                     89:        int32_t g6 = g[6];
                     90:        int32_t g7 = g[7];
                     91:        int32_t g8 = g[8];
                     92:        int32_t g9 = g[9];
                     93: 
                     94:        int32_t h0 = f0 + g0;
                     95:        int32_t h1 = f1 + g1;
                     96:        int32_t h2 = f2 + g2;
                     97:        int32_t h3 = f3 + g3;
                     98:        int32_t h4 = f4 + g4;
                     99:        int32_t h5 = f5 + g5;
                    100:        int32_t h6 = f6 + g6;
                    101:        int32_t h7 = f7 + g7;
                    102:        int32_t h8 = f8 + g8;
                    103:        int32_t h9 = f9 + g9;
                    104: 
                    105:        h[0] = h0;
                    106:        h[1] = h1;
                    107:        h[2] = h2;
                    108:        h[3] = h3;
                    109:        h[4] = h4;
                    110:        h[5] = h5;
                    111:        h[6] = h6;
                    112:        h[7] = h7;
                    113:        h[8] = h8;
                    114:        h[9] = h9;
                    115: }
                    116: 
                    117: /**
                    118:  * Replace (f,g) with (g,g) if b == 1;
                    119:  * replace (f,g) with (f,g) if b == 0.
                    120:  *
                    121:  * Preconditions: b in {0,1}.
                    122:  */
                    123: static void fe_cmov(fe f, const fe g, unsigned int b)
                    124: {
                    125:        int32_t f0 = f[0];
                    126:        int32_t f1 = f[1];
                    127:        int32_t f2 = f[2];
                    128:        int32_t f3 = f[3];
                    129:        int32_t f4 = f[4];
                    130:        int32_t f5 = f[5];
                    131:        int32_t f6 = f[6];
                    132:        int32_t f7 = f[7];
                    133:        int32_t f8 = f[8];
                    134:        int32_t f9 = f[9];
                    135: 
                    136:        int32_t g0 = g[0];
                    137:        int32_t g1 = g[1];
                    138:        int32_t g2 = g[2];
                    139:        int32_t g3 = g[3];
                    140:        int32_t g4 = g[4];
                    141:        int32_t g5 = g[5];
                    142:        int32_t g6 = g[6];
                    143:        int32_t g7 = g[7];
                    144:        int32_t g8 = g[8];
                    145:        int32_t g9 = g[9];
                    146: 
                    147:        int32_t x0 = f0 ^ g0;
                    148:        int32_t x1 = f1 ^ g1;
                    149:        int32_t x2 = f2 ^ g2;
                    150:        int32_t x3 = f3 ^ g3;
                    151:        int32_t x4 = f4 ^ g4;
                    152:        int32_t x5 = f5 ^ g5;
                    153:        int32_t x6 = f6 ^ g6;
                    154:        int32_t x7 = f7 ^ g7;
                    155:        int32_t x8 = f8 ^ g8;
                    156:        int32_t x9 = f9 ^ g9;
                    157: 
                    158:        b = (unsigned int) (- (int) b);
                    159: 
                    160:        x0 &= b;
                    161:        x1 &= b;
                    162:        x2 &= b;
                    163:        x3 &= b;
                    164:        x4 &= b;
                    165:        x5 &= b;
                    166:        x6 &= b;
                    167:        x7 &= b;
                    168:        x8 &= b;
                    169:        x9 &= b;
                    170: 
                    171:        f[0] = f0 ^ x0;
                    172:        f[1] = f1 ^ x1;
                    173:        f[2] = f2 ^ x2;
                    174:        f[3] = f3 ^ x3;
                    175:        f[4] = f4 ^ x4;
                    176:        f[5] = f5 ^ x5;
                    177:        f[6] = f6 ^ x6;
                    178:        f[7] = f7 ^ x7;
                    179:        f[8] = f8 ^ x8;
                    180:        f[9] = f9 ^ x9;
                    181: }
                    182: 
                    183: /**
                    184:  * h = f
                    185:  */
                    186: static void fe_copy(fe h, const fe f)
                    187: {
                    188:        int32_t f0 = f[0];
                    189:        int32_t f1 = f[1];
                    190:        int32_t f2 = f[2];
                    191:        int32_t f3 = f[3];
                    192:        int32_t f4 = f[4];
                    193:        int32_t f5 = f[5];
                    194:        int32_t f6 = f[6];
                    195:        int32_t f7 = f[7];
                    196:        int32_t f8 = f[8];
                    197:        int32_t f9 = f[9];
                    198: 
                    199:        h[0] = f0;
                    200:        h[1] = f1;
                    201:        h[2] = f2;
                    202:        h[3] = f3;
                    203:        h[4] = f4;
                    204:        h[5] = f5;
                    205:        h[6] = f6;
                    206:        h[7] = f7;
                    207:        h[8] = f8;
                    208:        h[9] = f9;
                    209: }
                    210: 
                    211: /**
                    212:  * Ignores top bit of h.
                    213:  */
                    214: static void fe_frombytes(fe h, const uint8_t *s)
                    215: {
                    216:        int64_t h0 = load_4(s);
                    217:        int64_t h1 = load_3(s + 4) << 6;
                    218:        int64_t h2 = load_3(s + 7) << 5;
                    219:        int64_t h3 = load_3(s + 10) << 3;
                    220:        int64_t h4 = load_3(s + 13) << 2;
                    221:        int64_t h5 = load_4(s + 16);
                    222:        int64_t h6 = load_3(s + 20) << 7;
                    223:        int64_t h7 = load_3(s + 23) << 5;
                    224:        int64_t h8 = load_3(s + 26) << 4;
                    225:        int64_t h9 = (load_3(s + 29) & 8388607) << 2;
                    226: 
                    227:        int64_t carry0, carry1, carry2, carry3, carry4;
                    228:        int64_t carry5, carry6, carry7, carry8, carry9;
                    229: 
                    230:        carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
                    231:        h0 += carry9 * 19;
                    232:        h9 -= carry9 * ((uint64_t) 1L << 25);
                    233: 
                    234:        carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
                    235:        h2 += carry1;
                    236:        h1 -= carry1 * ((uint64_t) 1L << 25);
                    237: 
                    238:        carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
                    239:        h4 += carry3;
                    240:        h3 -= carry3 * ((uint64_t) 1L << 25);
                    241: 
                    242:        carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
                    243:        h6 += carry5;
                    244:        h5 -= carry5 * ((uint64_t) 1L << 25);
                    245: 
                    246:        carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
                    247:        h8 += carry7;
                    248:        h7 -= carry7 * ((uint64_t) 1L << 25);
                    249: 
                    250:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                    251:        h1 += carry0;
                    252:        h0 -= carry0 * ((uint64_t) 1L << 26);
                    253: 
                    254:        carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
                    255:        h3 += carry2;
                    256:        h2 -= carry2 * ((uint64_t) 1L << 26);
                    257: 
                    258:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                    259:        h5 += carry4;
                    260:        h4 -= carry4 * ((uint64_t) 1L << 26);
                    261: 
                    262:        carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
                    263:        h7 += carry6;
                    264:        h6 -= carry6 * ((uint64_t) 1L << 26);
                    265: 
                    266:        carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
                    267:        h9 += carry8;
                    268:        h8 -= carry8 * ((uint64_t) 1L << 26);
                    269: 
                    270:        h[0] = (int32_t) h0;
                    271:        h[1] = (int32_t) h1;
                    272:        h[2] = (int32_t) h2;
                    273:        h[3] = (int32_t) h3;
                    274:        h[4] = (int32_t) h4;
                    275:        h[5] = (int32_t) h5;
                    276:        h[6] = (int32_t) h6;
                    277:        h[7] = (int32_t) h7;
                    278:        h[8] = (int32_t) h8;
                    279:        h[9] = (int32_t) h9;
                    280: }
                    281: 
                    282: /**
                    283:  * Preconditions:
                    284:  * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
                    285:  *
                    286:  * Write p=2^255-19; q=floor(h/p).
                    287:  * Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
                    288:  *
                    289:  * Proof:
                    290:  * Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
                    291:  * Also have |h-2^230 h9|<2^231 so |19 2^(-255)(h-2^230 h9)|<1/4.
                    292:  *
                    293:  * Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
                    294:  * Then 0<y<1.
                    295:  *
                    296:  * Write r=h-pq.
                    297:  * Have 0<=r<=p-1=2^255-20.
                    298:  * Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
                    299:  *
                    300:  * Write x=r+19(2^-255)r+y.
                    301:  * Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
                    302:  *
                    303:  * Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
                    304:  * so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
                    305:  */
                    306: static void fe_tobytes(uint8_t *s, const fe h)
                    307: {
                    308:        int32_t h0 = h[0];
                    309:        int32_t h1 = h[1];
                    310:        int32_t h2 = h[2];
                    311:        int32_t h3 = h[3];
                    312:        int32_t h4 = h[4];
                    313:        int32_t h5 = h[5];
                    314:        int32_t h6 = h[6];
                    315:        int32_t h7 = h[7];
                    316:        int32_t h8 = h[8];
                    317:        int32_t h9 = h[9];
                    318: 
                    319:        int32_t carry0, carry1, carry2, carry3, carry4;
                    320:        int32_t carry5, carry6, carry7, carry8, carry9;
                    321:        int32_t q;
                    322: 
                    323:        q = (19 * h9 + ((uint32_t) 1L << 24)) >> 25;
                    324:        q = (h0 + q) >> 26;
                    325:        q = (h1 + q) >> 25;
                    326:        q = (h2 + q) >> 26;
                    327:        q = (h3 + q) >> 25;
                    328:        q = (h4 + q) >> 26;
                    329:        q = (h5 + q) >> 25;
                    330:        q = (h6 + q) >> 26;
                    331:        q = (h7 + q) >> 25;
                    332:        q = (h8 + q) >> 26;
                    333:        q = (h9 + q) >> 25;
                    334: 
                    335:        /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
                    336:        h0 += 19 * q;
                    337:        /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
                    338: 
                    339:        carry0 = h0 >> 26;
                    340:        h1 += carry0;
                    341:        h0 -= carry0 * ((uint32_t) 1L << 26);
                    342: 
                    343:        carry1 = h1 >> 25;
                    344:        h2 += carry1;
                    345:        h1 -= carry1 * ((uint32_t) 1L << 25);
                    346: 
                    347:        carry2 = h2 >> 26;
                    348:        h3 += carry2;
                    349:        h2 -= carry2 * ((uint32_t) 1L << 26);
                    350: 
                    351:        carry3 = h3 >> 25;
                    352:        h4 += carry3;
                    353:        h3 -= carry3 * ((uint32_t) 1L << 25);
                    354: 
                    355:        carry4 = h4 >> 26;
                    356:        h5 += carry4;
                    357:        h4 -= carry4 * ((uint32_t) 1L << 26);
                    358: 
                    359:        carry5 = h5 >> 25;
                    360:        h6 += carry5;
                    361:        h5 -= carry5 * ((uint32_t) 1L << 25);
                    362: 
                    363:        carry6 = h6 >> 26;
                    364:        h7 += carry6;
                    365:        h6 -= carry6 * ((uint32_t) 1L << 26);
                    366: 
                    367:        carry7 = h7 >> 25;
                    368:        h8 += carry7;
                    369:        h7 -= carry7 * ((uint32_t) 1L << 25);
                    370: 
                    371:        carry8 = h8 >> 26;
                    372:        h9 += carry8;
                    373:        h8 -= carry8 * ((uint32_t) 1L << 26);
                    374: 
                    375:        carry9 = h9 >> 25;
                    376:        h9 -= carry9 * ((uint32_t) 1L << 25);
                    377:        /* h10 = carry9 */
                    378: 
                    379:        /**
                    380:         * Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
                    381:         * Have h0+...+2^230 h9 between 0 and 2^255-1;
                    382:         * evidently 2^255 h10-2^255 q = 0.
                    383:         * Goal: Output h0+...+2^230 h9.
                    384:         */
                    385:        s[0]  = h0 >> 0;
                    386:        s[1]  = h0 >> 8;
                    387:        s[2]  = h0 >> 16;
                    388:        s[3]  = (h0 >> 24) | (h1 * ((uint32_t) 1 << 2));
                    389:        s[4]  = h1 >> 6;
                    390:        s[5]  = h1 >> 14;
                    391:        s[6]  = (h1 >> 22) | (h2 * ((uint32_t) 1 << 3));
                    392:        s[7]  = h2 >> 5;
                    393:        s[8]  = h2 >> 13;
                    394:        s[9]  = (h2 >> 21) | (h3 * ((uint32_t) 1 << 5));
                    395:        s[10] = h3 >> 3;
                    396:        s[11] = h3 >> 11;
                    397:        s[12] = (h3 >> 19) | (h4 * ((uint32_t) 1 << 6));
                    398:        s[13] = h4 >> 2;
                    399:        s[14] = h4 >> 10;
                    400:        s[15] = h4 >> 18;
                    401:        s[16] = h5 >> 0;
                    402:        s[17] = h5 >> 8;
                    403:        s[18] = h5 >> 16;
                    404:        s[19] = (h5 >> 24) | (h6 * ((uint32_t) 1 << 1));
                    405:        s[20] = h6 >> 7;
                    406:        s[21] = h6 >> 15;
                    407:        s[22] = (h6 >> 23) | (h7 * ((uint32_t) 1 << 3));
                    408:        s[23] = h7 >> 5;
                    409:        s[24] = h7 >> 13;
                    410:        s[25] = (h7 >> 21) | (h8 * ((uint32_t) 1 << 4));
                    411:        s[26] = h8 >> 4;
                    412:        s[27] = h8 >> 12;
                    413:        s[28] = (h8 >> 20) | (h9 * ((uint32_t) 1 << 6));
                    414:        s[29] = h9 >> 2;
                    415:        s[30] = h9 >> 10;
                    416:        s[31] = h9 >> 18;
                    417: }
                    418: 
                    419: /**
                    420:  * return 1 if f is in {1,3,5,...,q-2}
                    421:  * return 0 if f is in {0,2,4,...,q-1}
                    422:  *
                    423:  * Preconditions:
                    424:  * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
                    425:  */
                    426: static int fe_isnegative(const fe f)
                    427: {
                    428:        uint8_t s[32];
                    429: 
                    430:        fe_tobytes(s,f);
                    431: 
                    432:        return s[0] & 1;
                    433: }
                    434: 
                    435: /**
                    436:  * return 1 if f != 0
                    437:  * return 0 if f == 0
                    438:  *
                    439:  * Preconditions:
                    440:  * |f| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25,etc.
                    441:  */
                    442: static uint8_t zero[32];
                    443: 
                    444: static int fe_isnonzero(const fe f)
                    445: {
                    446:        uint8_t s[32];
                    447: 
                    448:        fe_tobytes(s, f);
                    449: 
                    450:        return !memeq_const(s, zero, 32);
                    451: }
                    452: 
                    453: /**
                    454:  * h = f * g
                    455:  * Can overlap h with f or g.
                    456:  *
                    457:  * Preconditions:
                    458:  * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
                    459:  * |g| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
                    460:  *
                    461:  * Postconditions:
                    462:  * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
                    463:  */
                    464: 
                    465: /**
                    466:  * Notes on implementation strategy:
                    467:  *
                    468:  * Using schoolbook multiplication.
                    469:  * Karatsuba would save a little in some cost models.
                    470:  *
                    471:  * Most multiplications by 2 and 19 are 32-bit precomputations;
                    472:  * cheaper than 64-bit postcomputations.
                    473:  *
                    474:  * There is one remaining multiplication by 19 in the carry chain;
                    475:  * one *19 precomputation can be merged into this,
                    476:  * but the resulting data flow is considerably less clean.
                    477:  *
                    478:  * There are 12 carries below.
                    479:  * 10 of them are 2-way parallelizable and vectorizable.
                    480:  * Can get away with 11 carries, but then data flow is much deeper.
                    481:  *
                    482:  * With tighter constraints on inputs can squeeze carries into int32.
                    483:  */
                    484: 
                    485: static void fe_mul(fe h, const fe f, const fe g)
                    486: {
                    487:        int32_t f0 = f[0];
                    488:        int32_t f1 = f[1];
                    489:        int32_t f2 = f[2];
                    490:        int32_t f3 = f[3];
                    491:        int32_t f4 = f[4];
                    492:        int32_t f5 = f[5];
                    493:        int32_t f6 = f[6];
                    494:        int32_t f7 = f[7];
                    495:        int32_t f8 = f[8];
                    496:        int32_t f9 = f[9];
                    497: 
                    498:        int32_t g0 = g[0];
                    499:        int32_t g1 = g[1];
                    500:        int32_t g2 = g[2];
                    501:        int32_t g3 = g[3];
                    502:        int32_t g4 = g[4];
                    503:        int32_t g5 = g[5];
                    504:        int32_t g6 = g[6];
                    505:        int32_t g7 = g[7];
                    506:        int32_t g8 = g[8];
                    507:        int32_t g9 = g[9];
                    508: 
                    509:        int32_t g1_19 = 19 * g1; /* 1.959375*2^29 */
                    510:        int32_t g2_19 = 19 * g2; /* 1.959375*2^30; still ok */
                    511:        int32_t g3_19 = 19 * g3;
                    512:        int32_t g4_19 = 19 * g4;
                    513:        int32_t g5_19 = 19 * g5;
                    514:        int32_t g6_19 = 19 * g6;
                    515:        int32_t g7_19 = 19 * g7;
                    516:        int32_t g8_19 = 19 * g8;
                    517:        int32_t g9_19 = 19 * g9;
                    518: 
                    519:        int32_t f1_2 = 2 * f1;
                    520:        int32_t f3_2 = 2 * f3;
                    521:        int32_t f5_2 = 2 * f5;
                    522:        int32_t f7_2 = 2 * f7;
                    523:        int32_t f9_2 = 2 * f9;
                    524: 
                    525:        int64_t f0g0    = f0   * (int64_t) g0;
                    526:        int64_t f0g1    = f0   * (int64_t) g1;
                    527:        int64_t f0g2    = f0   * (int64_t) g2;
                    528:        int64_t f0g3    = f0   * (int64_t) g3;
                    529:        int64_t f0g4    = f0   * (int64_t) g4;
                    530:        int64_t f0g5    = f0   * (int64_t) g5;
                    531:        int64_t f0g6    = f0   * (int64_t) g6;
                    532:        int64_t f0g7    = f0   * (int64_t) g7;
                    533:        int64_t f0g8    = f0   * (int64_t) g8;
                    534:        int64_t f0g9    = f0   * (int64_t) g9;
                    535: 
                    536:        int64_t f1g0    = f1   * (int64_t) g0;
                    537:        int64_t f1g1_2  = f1_2 * (int64_t) g1;
                    538:        int64_t f1g2    = f1   * (int64_t) g2;
                    539:        int64_t f1g3_2  = f1_2 * (int64_t) g3;
                    540:        int64_t f1g4    = f1   * (int64_t) g4;
                    541:        int64_t f1g5_2  = f1_2 * (int64_t) g5;
                    542:        int64_t f1g6    = f1   * (int64_t) g6;
                    543:        int64_t f1g7_2  = f1_2 * (int64_t) g7;
                    544:        int64_t f1g8    = f1   * (int64_t) g8;
                    545:        int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
                    546: 
                    547:        int64_t f2g0    = f2   * (int64_t) g0;
                    548:        int64_t f2g1    = f2   * (int64_t) g1;
                    549:        int64_t f2g2    = f2   * (int64_t) g2;
                    550:        int64_t f2g3    = f2   * (int64_t) g3;
                    551:        int64_t f2g4    = f2   * (int64_t) g4;
                    552:        int64_t f2g5    = f2   * (int64_t) g5;
                    553:        int64_t f2g6    = f2   * (int64_t) g6;
                    554:        int64_t f2g7    = f2   * (int64_t) g7;
                    555:        int64_t f2g8_19 = f2   * (int64_t) g8_19;
                    556:        int64_t f2g9_19 = f2   * (int64_t) g9_19;
                    557: 
                    558:        int64_t f3g0    = f3   * (int64_t) g0;
                    559:        int64_t f3g1_2  = f3_2 * (int64_t) g1;
                    560:        int64_t f3g2    = f3   * (int64_t) g2;
                    561:        int64_t f3g3_2  = f3_2 * (int64_t) g3;
                    562:        int64_t f3g4    = f3   * (int64_t) g4;
                    563:        int64_t f3g5_2  = f3_2 * (int64_t) g5;
                    564:        int64_t f3g6    = f3   * (int64_t) g6;
                    565:        int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
                    566:        int64_t f3g8_19 = f3   * (int64_t) g8_19;
                    567:        int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
                    568: 
                    569:        int64_t f4g0    = f4   * (int64_t) g0;
                    570:        int64_t f4g1    = f4   * (int64_t) g1;
                    571:        int64_t f4g2    = f4   * (int64_t) g2;
                    572:        int64_t f4g3    = f4   * (int64_t) g3;
                    573:        int64_t f4g4    = f4   * (int64_t) g4;
                    574:        int64_t f4g5    = f4   * (int64_t) g5;
                    575:        int64_t f4g6_19 = f4   * (int64_t) g6_19;
                    576:        int64_t f4g7_19 = f4   * (int64_t) g7_19;
                    577:        int64_t f4g8_19 = f4   * (int64_t) g8_19;
                    578:        int64_t f4g9_19 = f4   * (int64_t) g9_19;
                    579: 
                    580:        int64_t f5g0    = f5   * (int64_t) g0;
                    581:        int64_t f5g1_2  = f5_2 * (int64_t) g1;
                    582:        int64_t f5g2    = f5   * (int64_t) g2;
                    583:        int64_t f5g3_2  = f5_2 * (int64_t) g3;
                    584:        int64_t f5g4    = f5   * (int64_t) g4;
                    585:        int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
                    586:        int64_t f5g6_19 = f5   * (int64_t) g6_19;
                    587:        int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
                    588:        int64_t f5g8_19 = f5   * (int64_t) g8_19;
                    589:        int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
                    590: 
                    591:        int64_t f6g0    = f6   * (int64_t) g0;
                    592:        int64_t f6g1    = f6   * (int64_t) g1;
                    593:        int64_t f6g2    = f6   * (int64_t) g2;
                    594:        int64_t f6g3    = f6   * (int64_t) g3;
                    595:        int64_t f6g4_19 = f6   * (int64_t) g4_19;
                    596:        int64_t f6g5_19 = f6   * (int64_t) g5_19;
                    597:        int64_t f6g6_19 = f6   * (int64_t) g6_19;
                    598:        int64_t f6g7_19 = f6   * (int64_t) g7_19;
                    599:        int64_t f6g8_19 = f6   * (int64_t) g8_19;
                    600:        int64_t f6g9_19 = f6   * (int64_t) g9_19;
                    601: 
                    602:        int64_t f7g0    = f7   * (int64_t) g0;
                    603:        int64_t f7g1_2  = f7_2 * (int64_t) g1;
                    604:        int64_t f7g2    = f7   * (int64_t) g2;
                    605:        int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
                    606:        int64_t f7g4_19 = f7   * (int64_t) g4_19;
                    607:        int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
                    608:        int64_t f7g6_19 = f7   * (int64_t) g6_19;
                    609:        int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
                    610:        int64_t f7g8_19 = f7   * (int64_t) g8_19;
                    611:        int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
                    612: 
                    613:        int64_t f8g0    = f8   * (int64_t) g0;
                    614:        int64_t f8g1    = f8   * (int64_t) g1;
                    615:        int64_t f8g2_19 = f8   * (int64_t) g2_19;
                    616:        int64_t f8g3_19 = f8   * (int64_t) g3_19;
                    617:        int64_t f8g4_19 = f8   * (int64_t) g4_19;
                    618:        int64_t f8g5_19 = f8   * (int64_t) g5_19;
                    619:        int64_t f8g6_19 = f8   * (int64_t) g6_19;
                    620:        int64_t f8g7_19 = f8   * (int64_t) g7_19;
                    621:        int64_t f8g8_19 = f8   * (int64_t) g8_19;
                    622:        int64_t f8g9_19 = f8   * (int64_t) g9_19;
                    623: 
                    624:        int64_t f9g0    = f9   * (int64_t) g0;
                    625:        int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
                    626:        int64_t f9g2_19 = f9   * (int64_t) g2_19;
                    627:        int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
                    628:        int64_t f9g4_19 = f9   * (int64_t) g4_19;
                    629:        int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
                    630:        int64_t f9g6_19 = f9   * (int64_t) g6_19;
                    631:        int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
                    632:        int64_t f9g8_19 = f9   * (int64_t) g8_19;
                    633:        int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
                    634: 
                    635:        int64_t h0 = f0g0    + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 +
                    636:                     f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38;
                    637:        int64_t h1 = f0g1    + f1g0    + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 +
                    638:                     f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19;
                    639:        int64_t h2 = f0g2    + f1g1_2  + f2g0    + f3g9_38 + f4g8_19 + f5g7_38 +
                    640:                     f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38;
                    641:        int64_t h3 = f0g3    + f1g2    + f2g1    + f3g0    + f4g9_19 + f5g8_19 +
                    642:                     f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19;
                    643:        int64_t h4 = f0g4    + f1g3_2  + f2g2    + f3g1_2  + f4g0    + f5g9_38 +
                    644:                     f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38;
                    645:        int64_t h5 = f0g5    + f1g4    + f2g3    + f3g2    + f4g1    + f5g0    +
                    646:                     f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19;
                    647:        int64_t h6 = f0g6    + f1g5_2  + f2g4    + f3g3_2  + f4g2    + f5g1_2  +
                    648:                     f6g0    + f7g9_38 + f8g8_19 + f9g7_38;
                    649:        int64_t h7 = f0g7    + f1g6    + f2g5    + f3g4    + f4g3    + f5g2    +
                    650:                     f6g1    + f7g0    + f8g9_19 + f9g8_19;
                    651:        int64_t h8 = f0g8    + f1g7_2  + f2g6    + f3g5_2  + f4g4    + f5g3_2  +
                    652:                     f6g2    + f7g1_2  + f8g0    + f9g9_38;
                    653:        int64_t h9 = f0g9    + f1g8    + f2g7    + f3g6    + f4g5    + f5g4    +
                    654:                     f6g3    + f7g2    + f8g1    + f9g0 ;
                    655: 
                    656:        int64_t carry0, carry1, carry2, carry3, carry4;
                    657:        int64_t carry5, carry6, carry7, carry8, carry9;
                    658: 
                    659:        /**
                    660:         * |h0| <= (1.65*1.65*2^52*(1+19+19+19+19)+1.65*1.65*2^50*(38+38+38+38+38))
                    661:         * i.e. |h0| <= 1.4*2^60; narrower ranges for h2, h4, h6, h8
                    662:         * |h1| <= (1.65*1.65*2^51*(1+1+19+19+19+19+19+19+19+19))
                    663:         * i.e. |h1| <= 1.7*2^59; narrower ranges for h3, h5, h7, h9
                    664:         */
                    665: 
                    666:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                    667:        h1 += carry0;
                    668:        h0 -= carry0 * ((uint64_t) 1L << 26);
                    669:        /* |h0| <= 2^25 */
                    670:        /* |h1| <= 1.71*2^59 */
                    671: 
                    672:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                    673:        h5 += carry4;
                    674:        h4 -= carry4 * ((uint64_t) 1L << 26);
                    675:        /* |h4| <= 2^25 */
                    676:        /* |h5| <= 1.71*2^59 */
                    677: 
                    678:        carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
                    679:        h2 += carry1;
                    680:        h1 -= carry1 * ((uint64_t) 1L << 25);
                    681:        /* |h1| <= 2^24; from now on fits into int32 */
                    682:        /* |h2| <= 1.41*2^60 */
                    683: 
                    684:        carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
                    685:        h6 += carry5;
                    686:        h5 -= carry5 * ((uint64_t) 1L << 25);
                    687:        /* |h5| <= 2^24; from now on fits into int32 */
                    688:        /* |h6| <= 1.41*2^60 */
                    689: 
                    690:        carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
                    691:        h3 += carry2;
                    692:        h2 -= carry2 * ((uint64_t) 1L << 26);
                    693:        /* |h2| <= 2^25; from now on fits into int32 unchanged */
                    694:        /* |h3| <= 1.71*2^59 */
                    695: 
                    696:        carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
                    697:        h7 += carry6;
                    698:        h6 -= carry6 * ((uint64_t) 1L << 26);
                    699:        /* |h6| <= 2^25; from now on fits into int32 unchanged */
                    700:        /* |h7| <= 1.71*2^59 */
                    701: 
                    702:        carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
                    703:        h4 += carry3;
                    704:        h3 -= carry3 * ((uint64_t) 1L << 25);
                    705:        /* |h3| <= 2^24; from now on fits into int32 unchanged */
                    706:        /* |h4| <= 1.72*2^34 */
                    707: 
                    708:        carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
                    709:        h8 += carry7;
                    710:        h7 -= carry7 * ((uint64_t) 1L << 25);
                    711:        /* |h7| <= 2^24; from now on fits into int32 unchanged */
                    712:        /* |h8| <= 1.41*2^60 */
                    713: 
                    714:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                    715:        h5 += carry4;
                    716:        h4 -= carry4 * ((uint64_t) 1L << 26);
                    717:        /* |h4| <= 2^25; from now on fits into int32 unchanged */
                    718:        /* |h5| <= 1.01*2^24 */
                    719: 
                    720:        carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
                    721:        h9 += carry8;
                    722:        h8 -= carry8 * ((uint64_t) 1L << 26);
                    723:        /* |h8| <= 2^25; from now on fits into int32 unchanged */
                    724:        /* |h9| <= 1.71*2^59 */
                    725: 
                    726:        carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
                    727:        h0 += carry9 * 19;
                    728:        h9 -= carry9 * ((uint64_t) 1L << 25);
                    729:        /* |h9| <= 2^24; from now on fits into int32 unchanged */
                    730:        /* |h0| <= 1.1*2^39 */
                    731: 
                    732:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                    733:        h1 += carry0;
                    734:        h0 -= carry0 * ((uint64_t) 1L << 26);
                    735:        /* |h0| <= 2^25; from now on fits into int32 unchanged */
                    736:        /* |h1| <= 1.01*2^24 */
                    737: 
                    738:        h[0] = (int32_t) h0;
                    739:        h[1] = (int32_t) h1;
                    740:        h[2] = (int32_t) h2;
                    741:        h[3] = (int32_t) h3;
                    742:        h[4] = (int32_t) h4;
                    743:        h[5] = (int32_t) h5;
                    744:        h[6] = (int32_t) h6;
                    745:        h[7] = (int32_t) h7;
                    746:        h[8] = (int32_t) h8;
                    747:        h[9] = (int32_t) h9;
                    748: }
                    749: 
                    750: /**
                    751:  * h = -f
                    752:  *
                    753:  * Preconditions:
                    754:  * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                    755:  *
                    756:  * Postconditions:
                    757:  * |h| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                    758:  */
                    759: static void fe_neg(fe h,const fe f)
                    760: {
                    761:        int32_t f0 = f[0];
                    762:        int32_t f1 = f[1];
                    763:        int32_t f2 = f[2];
                    764:        int32_t f3 = f[3];
                    765:        int32_t f4 = f[4];
                    766:        int32_t f5 = f[5];
                    767:        int32_t f6 = f[6];
                    768:        int32_t f7 = f[7];
                    769:        int32_t f8 = f[8];
                    770:        int32_t f9 = f[9];
                    771: 
                    772:        int32_t h0 = -f0;
                    773:        int32_t h1 = -f1;
                    774:        int32_t h2 = -f2;
                    775:        int32_t h3 = -f3;
                    776:        int32_t h4 = -f4;
                    777:        int32_t h5 = -f5;
                    778:        int32_t h6 = -f6;
                    779:        int32_t h7 = -f7;
                    780:        int32_t h8 = -f8;
                    781:        int32_t h9 = -f9;
                    782: 
                    783:        h[0] = h0;
                    784:        h[1] = h1;
                    785:        h[2] = h2;
                    786:        h[3] = h3;
                    787:        h[4] = h4;
                    788:        h[5] = h5;
                    789:        h[6] = h6;
                    790:        h[7] = h7;
                    791:        h[8] = h8;
                    792:        h[9] = h9;
                    793: }
                    794: 
                    795: /**
                    796:  * h = f * f
                    797:  * Can overlap h with f.
                    798:  *
                    799:  * Preconditions:
                    800:  * |f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
                    801:  *
                    802:  * Postconditions:
                    803:  * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
                    804:  *
                    805:  * See fe_mul.c for discussion of implementation strategy.
                    806:  */
                    807: static void fe_sq(fe h, const fe f)
                    808: {
                    809:        int32_t f0 = f[0];
                    810:        int32_t f1 = f[1];
                    811:        int32_t f2 = f[2];
                    812:        int32_t f3 = f[3];
                    813:        int32_t f4 = f[4];
                    814:        int32_t f5 = f[5];
                    815:        int32_t f6 = f[6];
                    816:        int32_t f7 = f[7];
                    817:        int32_t f8 = f[8];
                    818:        int32_t f9 = f[9];
                    819: 
                    820:        int32_t f0_2 = 2 * f0;
                    821:        int32_t f1_2 = 2 * f1;
                    822:        int32_t f2_2 = 2 * f2;
                    823:        int32_t f3_2 = 2 * f3;
                    824:        int32_t f4_2 = 2 * f4;
                    825:        int32_t f5_2 = 2 * f5;
                    826:        int32_t f6_2 = 2 * f6;
                    827:        int32_t f7_2 = 2 * f7;
                    828: 
                    829:        int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
                    830:        int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
                    831:        int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
                    832:        int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
                    833:        int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
                    834: 
                    835:        int64_t f0f0    = f0   * (int64_t) f0;
                    836:        int64_t f0f1_2  = f0_2 * (int64_t) f1;
                    837:        int64_t f0f2_2  = f0_2 * (int64_t) f2;
                    838:        int64_t f0f3_2  = f0_2 * (int64_t) f3;
                    839:        int64_t f0f4_2  = f0_2 * (int64_t) f4;
                    840:        int64_t f0f5_2  = f0_2 * (int64_t) f5;
                    841:        int64_t f0f6_2  = f0_2 * (int64_t) f6;
                    842:        int64_t f0f7_2  = f0_2 * (int64_t) f7;
                    843:        int64_t f0f8_2  = f0_2 * (int64_t) f8;
                    844:        int64_t f0f9_2  = f0_2 * (int64_t) f9;
                    845: 
                    846:        int64_t f1f1_2  = f1_2 * (int64_t) f1;
                    847:        int64_t f1f2_2  = f1_2 * (int64_t) f2;
                    848:        int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
                    849:        int64_t f1f4_2  = f1_2 * (int64_t) f4;
                    850:        int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
                    851:        int64_t f1f6_2  = f1_2 * (int64_t) f6;
                    852:        int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
                    853:        int64_t f1f8_2  = f1_2 * (int64_t) f8;
                    854:        int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
                    855: 
                    856:        int64_t f2f2    = f2   * (int64_t) f2;
                    857:        int64_t f2f3_2  = f2_2 * (int64_t) f3;
                    858:        int64_t f2f4_2  = f2_2 * (int64_t) f4;
                    859:        int64_t f2f5_2  = f2_2 * (int64_t) f5;
                    860:        int64_t f2f6_2  = f2_2 * (int64_t) f6;
                    861:        int64_t f2f7_2  = f2_2 * (int64_t) f7;
                    862:        int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
                    863:        int64_t f2f9_38 = f2   * (int64_t) f9_38;
                    864: 
                    865:        int64_t f3f3_2  = f3_2 * (int64_t) f3;
                    866:        int64_t f3f4_2  = f3_2 * (int64_t) f4;
                    867:        int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
                    868:        int64_t f3f6_2  = f3_2 * (int64_t) f6;
                    869:        int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
                    870:        int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
                    871:        int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
                    872: 
                    873:        int64_t f4f4    = f4   * (int64_t) f4;
                    874:        int64_t f4f5_2  = f4_2 * (int64_t) f5;
                    875:        int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
                    876:        int64_t f4f7_38 = f4   * (int64_t) f7_38;
                    877:        int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
                    878:        int64_t f4f9_38 = f4   * (int64_t) f9_38;
                    879: 
                    880:        int64_t f5f5_38 = f5   * (int64_t) f5_38;
                    881:        int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
                    882:        int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
                    883:        int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
                    884:        int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
                    885: 
                    886:        int64_t f6f6_19 = f6   * (int64_t) f6_19;
                    887:        int64_t f6f7_38 = f6   * (int64_t) f7_38;
                    888:        int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
                    889:        int64_t f6f9_38 = f6   * (int64_t) f9_38;
                    890: 
                    891:        int64_t f7f7_38 = f7   * (int64_t) f7_38;
                    892:        int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
                    893:        int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
                    894: 
                    895:        int64_t f8f8_19 = f8   * (int64_t) f8_19;
                    896:        int64_t f8f9_38 = f8   * (int64_t) f9_38;
                    897: 
                    898:        int64_t f9f9_38 = f9   * (int64_t) f9_38;
                    899: 
                    900:        int64_t h0 = f0f0   + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
                    901:        int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
                    902:        int64_t h2 = f0f2_2 + f1f1_2  + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
                    903:        int64_t h3 = f0f3_2 + f1f2_2  + f4f9_38 + f5f8_38 + f6f7_38;
                    904:        int64_t h4 = f0f4_2 + f1f3_4  + f2f2    + f5f9_76 + f6f8_38 + f7f7_38;
                    905:        int64_t h5 = f0f5_2 + f1f4_2  + f2f3_2  + f6f9_38 + f7f8_38;
                    906:        int64_t h6 = f0f6_2 + f1f5_4  + f2f4_2  + f3f3_2  + f7f9_76 + f8f8_19;
                    907:        int64_t h7 = f0f7_2 + f1f6_2  + f2f5_2  + f3f4_2  + f8f9_38;
                    908:        int64_t h8 = f0f8_2 + f1f7_4  + f2f6_2  + f3f5_4  + f4f4    + f9f9_38;
                    909:        int64_t h9 = f0f9_2 + f1f8_2  + f2f7_2  + f3f6_2  + f4f5_2;
                    910: 
                    911:        int64_t carry0, carry1, carry2, carry3, carry4;
                    912:        int64_t carry5, carry6, carry7, carry8, carry9;
                    913: 
                    914:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                    915:        h1 += carry0;
                    916:        h0 -= carry0 * ((uint64_t) 1L << 26);
                    917: 
                    918:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                    919:        h5 += carry4;
                    920:        h4 -= carry4 * ((uint64_t) 1L << 26);
                    921: 
                    922:        carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
                    923:        h2 += carry1;
                    924:        h1 -= carry1 * ((uint64_t) 1L << 25);
                    925: 
                    926:        carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
                    927:        h6 += carry5;
                    928:        h5 -= carry5 * ((uint64_t) 1L << 25);
                    929: 
                    930:        carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
                    931:        h3 += carry2;
                    932:        h2 -= carry2 * ((uint64_t) 1L << 26);
                    933: 
                    934:        carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
                    935:        h7 += carry6;
                    936:        h6 -= carry6 * ((uint64_t) 1L << 26);
                    937: 
                    938:        carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
                    939:        h4 += carry3;
                    940:        h3 -= carry3 * ((uint64_t) 1L << 25);
                    941: 
                    942:        carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
                    943:        h8 += carry7;
                    944:        h7 -= carry7 * ((uint64_t) 1L << 25);
                    945: 
                    946:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                    947:        h5 += carry4;
                    948:        h4 -= carry4 * ((uint64_t) 1L << 26);
                    949: 
                    950:        carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
                    951:        h9 += carry8;
                    952:        h8 -= carry8 * ((uint64_t) 1L << 26);
                    953: 
                    954:        carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
                    955:        h0 += carry9 * 19;
                    956:        h9 -= carry9 * ((uint64_t) 1L << 25);
                    957: 
                    958:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                    959:        h1 += carry0;
                    960:        h0 -= carry0 * ((uint64_t) 1L << 26);
                    961: 
                    962:        h[0] = (int32_t) h0;
                    963:        h[1] = (int32_t) h1;
                    964:        h[2] = (int32_t) h2;
                    965:        h[3] = (int32_t) h3;
                    966:        h[4] = (int32_t) h4;
                    967:        h[5] = (int32_t) h5;
                    968:        h[6] = (int32_t) h6;
                    969:        h[7] = (int32_t) h7;
                    970:        h[8] = (int32_t) h8;
                    971:        h[9] = (int32_t) h9;
                    972: }
                    973: 
                    974: /**
                    975:  * h = 2 * f * f
                    976:  * Can overlap h with f.
                    977:  *
                    978:  * Preconditions:
                    979:  *|f| bounded by 1.65*2^26, 1.65*2^25, 1.65*2^26, 1.65*2^25, etc.
                    980:  *
                    981:  * Postconditions:
                    982:  * |h| bounded by 1.01*2^25, 1.01*2^24, 1.01*2^25, 1.01*2^24, etc.
                    983:  *
                    984:  * See fe_mul.c for discussion of implementation strategy.
                    985:  */
                    986: static void fe_sq2(fe h, const fe f)
                    987: {
                    988:        int32_t f0 = f[0];
                    989:        int32_t f1 = f[1];
                    990:        int32_t f2 = f[2];
                    991:        int32_t f3 = f[3];
                    992:        int32_t f4 = f[4];
                    993:        int32_t f5 = f[5];
                    994:        int32_t f6 = f[6];
                    995:        int32_t f7 = f[7];
                    996:        int32_t f8 = f[8];
                    997:        int32_t f9 = f[9];
                    998: 
                    999:        int32_t f0_2 = 2 * f0;
                   1000:        int32_t f1_2 = 2 * f1;
                   1001:        int32_t f2_2 = 2 * f2;
                   1002:        int32_t f3_2 = 2 * f3;
                   1003:        int32_t f4_2 = 2 * f4;
                   1004:        int32_t f5_2 = 2 * f5;
                   1005:        int32_t f6_2 = 2 * f6;
                   1006:        int32_t f7_2 = 2 * f7;
                   1007: 
                   1008:        int32_t f5_38 = 38 * f5; /* 1.959375*2^30 */
                   1009:        int32_t f6_19 = 19 * f6; /* 1.959375*2^30 */
                   1010:        int32_t f7_38 = 38 * f7; /* 1.959375*2^30 */
                   1011:        int32_t f8_19 = 19 * f8; /* 1.959375*2^30 */
                   1012:        int32_t f9_38 = 38 * f9; /* 1.959375*2^30 */
                   1013: 
                   1014:        int64_t f0f0    = f0   * (int64_t) f0;
                   1015:        int64_t f0f1_2  = f0_2 * (int64_t) f1;
                   1016:        int64_t f0f2_2  = f0_2 * (int64_t) f2;
                   1017:        int64_t f0f3_2  = f0_2 * (int64_t) f3;
                   1018:        int64_t f0f4_2  = f0_2 * (int64_t) f4;
                   1019:        int64_t f0f5_2  = f0_2 * (int64_t) f5;
                   1020:        int64_t f0f6_2  = f0_2 * (int64_t) f6;
                   1021:        int64_t f0f7_2  = f0_2 * (int64_t) f7;
                   1022:        int64_t f0f8_2  = f0_2 * (int64_t) f8;
                   1023:        int64_t f0f9_2  = f0_2 * (int64_t) f9;
                   1024: 
                   1025:        int64_t f1f1_2  = f1_2 * (int64_t) f1;
                   1026:        int64_t f1f2_2  = f1_2 * (int64_t) f2;
                   1027:        int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
                   1028:        int64_t f1f4_2  = f1_2 * (int64_t) f4;
                   1029:        int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
                   1030:        int64_t f1f6_2  = f1_2 * (int64_t) f6;
                   1031:        int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
                   1032:        int64_t f1f8_2  = f1_2 * (int64_t) f8;
                   1033:        int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
                   1034: 
                   1035:        int64_t f2f2    = f2   * (int64_t) f2;
                   1036:        int64_t f2f3_2  = f2_2 * (int64_t) f3;
                   1037:        int64_t f2f4_2  = f2_2 * (int64_t) f4;
                   1038:        int64_t f2f5_2  = f2_2 * (int64_t) f5;
                   1039:        int64_t f2f6_2  = f2_2 * (int64_t) f6;
                   1040:        int64_t f2f7_2  = f2_2 * (int64_t) f7;
                   1041:        int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
                   1042:        int64_t f2f9_38 = f2   * (int64_t) f9_38;
                   1043: 
                   1044:        int64_t f3f3_2  = f3_2 * (int64_t) f3;
                   1045:        int64_t f3f4_2  = f3_2 * (int64_t) f4;
                   1046:        int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
                   1047:        int64_t f3f6_2  = f3_2 * (int64_t) f6;
                   1048:        int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
                   1049:        int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
                   1050:        int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
                   1051: 
                   1052:        int64_t f4f4    = f4   * (int64_t) f4;
                   1053:        int64_t f4f5_2  = f4_2 * (int64_t) f5;
                   1054:        int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
                   1055:        int64_t f4f7_38 = f4   * (int64_t) f7_38;
                   1056:        int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
                   1057:        int64_t f4f9_38 = f4   * (int64_t) f9_38;
                   1058: 
                   1059:        int64_t f5f5_38 = f5   * (int64_t) f5_38;
                   1060:        int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
                   1061:        int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
                   1062:        int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
                   1063:        int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
                   1064: 
                   1065:        int64_t f6f6_19 = f6   * (int64_t) f6_19;
                   1066:        int64_t f6f7_38 = f6   * (int64_t) f7_38;
                   1067:        int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
                   1068:        int64_t f6f9_38 = f6   * (int64_t) f9_38;
                   1069: 
                   1070:        int64_t f7f7_38 = f7   * (int64_t) f7_38;
                   1071:        int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
                   1072:        int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
                   1073: 
                   1074:        int64_t f8f8_19 = f8   * (int64_t) f8_19;
                   1075:        int64_t f8f9_38 = f8   * (int64_t) f9_38;
                   1076: 
                   1077:        int64_t f9f9_38 = f9   * (int64_t) f9_38;
                   1078: 
                   1079:        int64_t h0 = f0f0   + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38;
                   1080:        int64_t h1 = f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38;
                   1081:        int64_t h2 = f0f2_2 + f1f1_2  + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19;
                   1082:        int64_t h3 = f0f3_2 + f1f2_2  + f4f9_38 + f5f8_38 + f6f7_38;
                   1083:        int64_t h4 = f0f4_2 + f1f3_4  + f2f2    + f5f9_76 + f6f8_38 + f7f7_38;
                   1084:        int64_t h5 = f0f5_2 + f1f4_2  + f2f3_2  + f6f9_38 + f7f8_38;
                   1085:        int64_t h6 = f0f6_2 + f1f5_4  + f2f4_2  + f3f3_2  + f7f9_76 + f8f8_19;
                   1086:        int64_t h7 = f0f7_2 + f1f6_2  + f2f5_2  + f3f4_2  + f8f9_38;
                   1087:        int64_t h8 = f0f8_2 + f1f7_4  + f2f6_2  + f3f5_4  + f4f4    + f9f9_38;
                   1088:        int64_t h9 = f0f9_2 + f1f8_2  + f2f7_2  + f3f6_2  + f4f5_2;
                   1089: 
                   1090:        int64_t carry0, carry1, carry2, carry3, carry4;
                   1091:        int64_t carry5, carry6, carry7, carry8, carry9;
                   1092: 
                   1093:        h0 += h0;
                   1094:        h1 += h1;
                   1095:        h2 += h2;
                   1096:        h3 += h3;
                   1097:        h4 += h4;
                   1098:        h5 += h5;
                   1099:        h6 += h6;
                   1100:        h7 += h7;
                   1101:        h8 += h8;
                   1102:        h9 += h9;
                   1103: 
                   1104:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                   1105:        h1 += carry0;
                   1106:        h0 -= carry0 * ((uint64_t) 1L << 26);
                   1107: 
                   1108:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                   1109:        h5 += carry4;
                   1110:        h4 -= carry4 * ((uint64_t) 1L << 26);
                   1111: 
                   1112:        carry1 = (h1 + (int64_t) (1L << 24)) >> 25;
                   1113:        h2 += carry1;
                   1114:        h1 -= carry1 * ((uint64_t) 1L << 25);
                   1115: 
                   1116:        carry5 = (h5 + (int64_t) (1L << 24)) >> 25;
                   1117:        h6 += carry5;
                   1118:        h5 -= carry5 * ((uint64_t) 1L << 25);
                   1119: 
                   1120:        carry2 = (h2 + (int64_t) (1L << 25)) >> 26;
                   1121:        h3 += carry2;
                   1122:        h2 -= carry2 * ((uint64_t) 1L << 26);
                   1123: 
                   1124:        carry6 = (h6 + (int64_t) (1L << 25)) >> 26;
                   1125:        h7 += carry6;
                   1126:        h6 -= carry6 * ((uint64_t) 1L << 26);
                   1127: 
                   1128:        carry3 = (h3 + (int64_t) (1L << 24)) >> 25;
                   1129:        h4 += carry3;
                   1130:        h3 -= carry3 * ((uint64_t) 1L << 25);
                   1131: 
                   1132:        carry7 = (h7 + (int64_t) (1L << 24)) >> 25;
                   1133:        h8 += carry7;
                   1134:        h7 -= carry7 * ((uint64_t) 1L << 25);
                   1135: 
                   1136:        carry4 = (h4 + (int64_t) (1L << 25)) >> 26;
                   1137:        h5 += carry4;
                   1138:        h4 -= carry4 * ((uint64_t) 1L << 26);
                   1139: 
                   1140:        carry8 = (h8 + (int64_t) (1L << 25)) >> 26;
                   1141:        h9 += carry8;
                   1142:        h8 -= carry8 * ((uint64_t) 1L << 26);
                   1143: 
                   1144:        carry9 = (h9 + (int64_t) (1L << 24)) >> 25;
                   1145:        h0 += carry9 * 19;
                   1146:        h9 -= carry9 * ((uint64_t) 1L << 25);
                   1147: 
                   1148:        carry0 = (h0 + (int64_t) (1L << 25)) >> 26;
                   1149:        h1 += carry0;
                   1150:        h0 -= carry0 * ((uint64_t) 1L << 26);
                   1151: 
                   1152:        h[0] = (int32_t) h0;
                   1153:        h[1] = (int32_t) h1;
                   1154:        h[2] = (int32_t) h2;
                   1155:        h[3] = (int32_t) h3;
                   1156:        h[4] = (int32_t) h4;
                   1157:        h[5] = (int32_t) h5;
                   1158:        h[6] = (int32_t) h6;
                   1159:        h[7] = (int32_t) h7;
                   1160:        h[8] = (int32_t) h8;
                   1161:        h[9] = (int32_t) h9;
                   1162: }
                   1163: 
                   1164: static void fe_invert(fe out, const fe z)
                   1165: {
                   1166:        fe t0, t1, t2, t3;
                   1167:        int i;
                   1168: 
                   1169:        fe_sq(t0, z);
                   1170:        fe_sq(t1, t0);
                   1171:        fe_sq(t1, t1);
                   1172:        fe_mul(t1, z, t1);
                   1173:        fe_mul(t0, t0, t1);
                   1174:        fe_sq(t2, t0);
                   1175:        fe_mul(t1, t1, t2);
                   1176:        fe_sq(t2, t1);
                   1177: 
                   1178:        for (i = 1; i < 5; ++i)
                   1179:        {
                   1180:                fe_sq(t2, t2);
                   1181:        }
                   1182: 
                   1183:        fe_mul(t1, t2, t1);
                   1184:        fe_sq(t2, t1);
                   1185: 
                   1186:        for (i = 1; i < 10; ++i)
                   1187:        {
                   1188:                fe_sq(t2, t2);
                   1189:        }
                   1190: 
                   1191:        fe_mul(t2, t2, t1);
                   1192:        fe_sq(t3, t2);
                   1193: 
                   1194:        for (i = 1; i < 20; ++i)
                   1195:        {
                   1196:                fe_sq(t3, t3);
                   1197:        }
                   1198: 
                   1199:        fe_mul(t2, t3, t2);
                   1200:        fe_sq(t2, t2);
                   1201: 
                   1202:        for (i = 1; i < 10; ++i)
                   1203:        {
                   1204:                fe_sq(t2, t2);
                   1205:        }
                   1206: 
                   1207:        fe_mul(t1, t2, t1);
                   1208:        fe_sq(t2, t1);
                   1209: 
                   1210:        for (i = 1; i < 50; ++i)
                   1211:        {
                   1212:                fe_sq(t2, t2);
                   1213:        }
                   1214: 
                   1215:        fe_mul(t2, t2, t1);
                   1216:        fe_sq(t3, t2);
                   1217: 
                   1218:        for (i = 1; i < 100; ++i)
                   1219:        {
                   1220:                fe_sq(t3, t3);
                   1221:        }
                   1222: 
                   1223:        fe_mul(t2, t3, t2);
                   1224:        fe_sq(t2, t2);
                   1225: 
                   1226:        for (i = 1; i < 50; ++i)
                   1227:        {
                   1228:                fe_sq(t2, t2);
                   1229:        }
                   1230: 
                   1231:        fe_mul(t1, t2, t1);
                   1232:        fe_sq(t1, t1);
                   1233: 
                   1234:        for (i = 1; i < 5; ++i)
                   1235:        {
                   1236:                fe_sq(t1, t1);
                   1237:        }
                   1238: 
                   1239:        fe_mul(out, t1, t0);
                   1240: }
                   1241: 
                   1242: static void fe_pow22523(fe out, const fe z)
                   1243: {
                   1244:        fe t0, t1, t2;
                   1245:        int i;
                   1246: 
                   1247:        fe_sq(t0, z);
                   1248:        fe_sq(t1, t0);
                   1249:        fe_sq(t1, t1);
                   1250:        fe_mul(t1, z, t1);
                   1251:        fe_mul(t0, t0, t1);
                   1252:        fe_sq(t0, t0);
                   1253:        fe_mul(t0, t1, t0);
                   1254:        fe_sq(t1, t0);
                   1255: 
                   1256:        for (i = 1; i < 5; ++i)
                   1257:        {
                   1258:                fe_sq(t1, t1);
                   1259:        }
                   1260: 
                   1261:        fe_mul(t0, t1, t0);
                   1262:        fe_sq(t1, t0);
                   1263: 
                   1264:        for (i = 1; i < 10; ++i)
                   1265:        {
                   1266:                fe_sq(t1, t1);
                   1267:        }
                   1268: 
                   1269:        fe_mul(t1, t1, t0);
                   1270:        fe_sq(t2, t1);
                   1271: 
                   1272:        for (i = 1; i < 20; ++i)
                   1273:        {
                   1274:                fe_sq(t2, t2);
                   1275:        }
                   1276: 
                   1277:        fe_mul(t1, t2, t1);
                   1278:        fe_sq(t1, t1);
                   1279: 
                   1280:        for (i = 1; i < 10; ++i)
                   1281:        {
                   1282:                fe_sq(t1, t1);
                   1283:        }
                   1284: 
                   1285:        fe_mul(t0, t1, t0);
                   1286:        fe_sq(t1, t0);
                   1287: 
                   1288:        for (i = 1; i < 50; ++i)
                   1289:        {
                   1290:                fe_sq(t1, t1);
                   1291:        }
                   1292: 
                   1293:        fe_mul(t1, t1, t0);
                   1294:        fe_sq(t2, t1);
                   1295:        for (i = 1; i < 100; ++i)
                   1296:        {
                   1297:                fe_sq(t2, t2);
                   1298:        }
                   1299: 
                   1300:        fe_mul(t1, t2, t1);
                   1301:        fe_sq(t1, t1);
                   1302: 
                   1303:        for (i = 1; i < 50; ++i)
                   1304:        {
                   1305:                fe_sq(t1, t1);
                   1306:        }
                   1307: 
                   1308:        fe_mul(t0, t1, t0);
                   1309:        fe_sq(t0, t0);
                   1310:        fe_sq(t0, t0);
                   1311:        fe_mul(out, t0, z);
                   1312: }
                   1313: 
                   1314: /**
                   1315:  * h = f - g
                   1316:  * Can overlap h with f or g.
                   1317:  *
                   1318:  * Preconditions:
                   1319:  * |f| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                   1320:  * |g| bounded by 1.1*2^25, 1.1*2^24, 1.1*2^25, 1.1*2^24, etc.
                   1321:  *
                   1322:  * Postconditions:
                   1323:  * |h| bounded by 1.1*2^26, 1.1*2^25, 1.1*2^26, 1.1*2^25, etc.
                   1324:  */
                   1325: static void fe_sub(fe h, const fe f, const fe g)
                   1326: {
                   1327:        int32_t f0 = f[0];
                   1328:        int32_t f1 = f[1];
                   1329:        int32_t f2 = f[2];
                   1330:        int32_t f3 = f[3];
                   1331:        int32_t f4 = f[4];
                   1332:        int32_t f5 = f[5];
                   1333:        int32_t f6 = f[6];
                   1334:        int32_t f7 = f[7];
                   1335:        int32_t f8 = f[8];
                   1336:        int32_t f9 = f[9];
                   1337: 
                   1338:        int32_t g0 = g[0];
                   1339:        int32_t g1 = g[1];
                   1340:        int32_t g2 = g[2];
                   1341:        int32_t g3 = g[3];
                   1342:        int32_t g4 = g[4];
                   1343:        int32_t g5 = g[5];
                   1344:        int32_t g6 = g[6];
                   1345:        int32_t g7 = g[7];
                   1346:        int32_t g8 = g[8];
                   1347:        int32_t g9 = g[9];
                   1348: 
                   1349:        int32_t h0 = f0 - g0;
                   1350:        int32_t h1 = f1 - g1;
                   1351:        int32_t h2 = f2 - g2;
                   1352:        int32_t h3 = f3 - g3;
                   1353:        int32_t h4 = f4 - g4;
                   1354:        int32_t h5 = f5 - g5;
                   1355:        int32_t h6 = f6 - g6;
                   1356:        int32_t h7 = f7 - g7;
                   1357:        int32_t h8 = f8 - g8;
                   1358:        int32_t h9 = f9 - g9;
                   1359: 
                   1360:        h[0] = h0;
                   1361:        h[1] = h1;
                   1362:        h[2] = h2;
                   1363:        h[3] = h3;
                   1364:        h[4] = h4;
                   1365:        h[5] = h5;
                   1366:        h[6] = h6;
                   1367:        h[7] = h7;
                   1368:        h[8] = h8;
                   1369:        h[9] = h9;
                   1370: }
                   1371: 
                   1372: /**
                   1373:  * r = p + q
                   1374:  */
                   1375: static void ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
                   1376: {
                   1377:        fe t0;
                   1378: 
                   1379:        fe_add(r->X, p->Y, p->X);
                   1380:        fe_sub(r->Y, p->Y, p->X);
                   1381:        fe_mul(r->Z, r->X, q->YplusX);
                   1382:        fe_mul(r->Y, r->Y, q->YminusX);
                   1383:        fe_mul(r->T, q->T2d, p->T);
                   1384:        fe_mul(r->X, p->Z, q->Z);
                   1385:        fe_add(t0, r->X, r->X);
                   1386:        fe_sub(r->X, r->Z, r->Y);
                   1387:        fe_add(r->Y, r->Z, r->Y);
                   1388:        fe_add(r->Z, t0, r->T);
                   1389:        fe_sub(r->T, t0, r->T);
                   1390: }
                   1391: 
                   1392: static void slide(int8_t *r, const uint8_t *a)
                   1393: {
                   1394:        int i, b, k;
                   1395: 
                   1396:        for (i = 0; i < 256; ++i)
                   1397:        {
                   1398:                r[i] = 1 & (a[i >> 3] >> (i & 7));
                   1399:        }
                   1400: 
                   1401:        for (i = 0; i < 256; ++i)
                   1402:        {
                   1403:                if (r[i])
                   1404:                {
                   1405:                        for (b = 1; b <= 6 && i + b < 256; ++b)
                   1406:                        {
                   1407:                                if (r[i + b])
                   1408:                                {
                   1409:                                        if (r[i] + (r[i + b] << b) <= 15)
                   1410:                                        {
                   1411:                                                r[i] += r[i + b] << b; r[i + b] = 0;
                   1412:                                        }
                   1413:                                        else if (r[i] - (r[i + b] << b) >= -15)
                   1414:                                        {
                   1415:                                                r[i] -= r[i + b] << b;
                   1416: 
                   1417:                                                for (k = i + b; k < 256; ++k)
                   1418:                                                {
                   1419:                                                        if (!r[k])
                   1420:                                                        {
                   1421:                                                                r[k] = 1;
                   1422:                                                                break;
                   1423:                                                        }
                   1424:                                                        r[k] = 0;
                   1425:                                                }
                   1426:                                        }
                   1427:                                        else
                   1428:                                        {
                   1429:                                                break;
                   1430:                                        }
                   1431:                                }
                   1432:                        }
                   1433:                }
                   1434:        }
                   1435: }
                   1436: 
                   1437: static const ge_precomp Bi[8] = {
                   1438: #include "base2.h"
                   1439: };
                   1440: 
                   1441: /* 37095705934669439343138083508754565189542113879843219016388785533085940283555 */
                   1442: static const fe d = {
                   1443:        -10913610,  13857413, -15372611,   6949391,    114729,
                   1444:         -8787816,  -6275908,  -3247719, -18696448, -12055116
                   1445: };
                   1446: 
                   1447: /* sqrt(-1) */
                   1448: static const fe sqrtm1 = {
                   1449:        -32595792,  -7943725,  9377950,    3500415,  12389472,
                   1450:          -272473, -25146209, -2005654,     326686,  11406482
                   1451: };
                   1452: 
                   1453: int ge_frombytes_negate_vartime(ge_p3 *h, const uint8_t *s)
                   1454: {
                   1455:        fe u, v, v3, vxx, check;
                   1456: 
                   1457:        fe_frombytes(h->Y,s);
                   1458:        fe_1(h->Z);
                   1459:        fe_sq(u,h->Y);
                   1460:        fe_mul(v,u,d);
                   1461:        fe_sub(u,u,h->Z);          /* u = y^2-1 */
                   1462:        fe_add(v,v,h->Z);          /* v = dy^2+1 */
                   1463: 
                   1464:        fe_sq(v3,v);
                   1465:        fe_mul(v3,v3,v);                /* v3 = v^3 */
                   1466:        fe_sq(h->X,v3);
                   1467:        fe_mul(h->X,h->X,v);
                   1468:        fe_mul(h->X,h->X,u);    /* x = uv^7 */
                   1469: 
                   1470:        fe_pow22523(h->X,h->X); /* x = (uv^7)^((q-5)/8) */
                   1471:        fe_mul(h->X,h->X,v3);
                   1472:        fe_mul(h->X,h->X,u);    /* x = uv^3(uv^7)^((q-5)/8) */
                   1473: 
                   1474:        fe_sq(vxx,h->X);
                   1475:        fe_mul(vxx,vxx,v);
                   1476:        fe_sub(check,vxx,u);    /* vx^2-u */
                   1477: 
                   1478:        if (fe_isnonzero(check))
                   1479:        {
                   1480:                fe_add(check,vxx,u);  /* vx^2+u */
                   1481: 
                   1482:                if (fe_isnonzero(check))
                   1483:                {
                   1484:                        return -1;
                   1485:                }
                   1486:                fe_mul(h->X,h->X,sqrtm1);
                   1487:        }
                   1488: 
                   1489:        if (fe_isnegative(h->X) == (s[31] >> 7))
                   1490:        {
                   1491:                fe_neg(h->X,h->X);
                   1492:        }
                   1493:        fe_mul(h->T,h->X,h->Y);
                   1494: 
                   1495:        return 0;
                   1496: }
                   1497: 
                   1498: /**
                   1499:  * r = p + q
                   1500:  */
                   1501: static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
                   1502: {
                   1503:        fe t0;
                   1504: 
                   1505:        fe_add(r->X, p->Y, p->X);
                   1506:        fe_sub(r->Y, p->Y, p->X);
                   1507:        fe_mul(r->Z, r->X, q->yplusx);
                   1508:        fe_mul(r->Y, r->Y, q->yminusx);
                   1509:        fe_mul(r->T, q->xy2d, p->T);
                   1510:        fe_add(t0, p->Z, p->Z);
                   1511:        fe_sub(r->X, r->Z, r->Y);
                   1512:        fe_add(r->Y, r->Z, r->Y);
                   1513:        fe_add(r->Z, t0, r->T);
                   1514:        fe_sub(r->T, t0, r->T);
                   1515: }
                   1516: 
                   1517: /**
                   1518:  * r = p - q
                   1519:  */
                   1520: static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q)
                   1521: {
                   1522:        fe t0;
                   1523: 
                   1524:        fe_add(r->X, p->Y, p->X);
                   1525:        fe_sub(r->Y, p->Y, p->X);
                   1526:        fe_mul(r->Z, r->X, q->yminusx);
                   1527:        fe_mul(r->Y, r->Y, q->yplusx);
                   1528:        fe_mul(r->T, q->xy2d, p->T);
                   1529:        fe_add(t0, p->Z, p->Z);
                   1530:        fe_sub(r->X, r->Z, r->Y);
                   1531:        fe_add(r->Y, r->Z, r->Y);
                   1532:        fe_sub(r->Z, t0, r->T);
                   1533:        fe_add(r->T, t0, r->T);
                   1534: }
                   1535: 
                   1536: /**
                   1537:  * r = p
                   1538:  */
                   1539: static void ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p)
                   1540: {
                   1541:        fe_mul(r->X,p->X,p->T);
                   1542:        fe_mul(r->Y,p->Y,p->Z);
                   1543:        fe_mul(r->Z,p->Z,p->T);
                   1544: }
                   1545: 
                   1546: /**
                   1547:  * r = p
                   1548:  */
                   1549: static void ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p)
                   1550: {
                   1551:        fe_mul(r->X,p->X,p->T);
                   1552:        fe_mul(r->Y,p->Y,p->Z);
                   1553:        fe_mul(r->Z,p->Z,p->T);
                   1554:        fe_mul(r->T,p->X,p->Y);
                   1555: }
                   1556: 
                   1557: static void ge_p2_0(ge_p2 *h)
                   1558: {
                   1559:        fe_0(h->X);
                   1560:        fe_1(h->Y);
                   1561:        fe_1(h->Z);
                   1562: }
                   1563: 
                   1564: /**
                   1565:  * r = 2 * p
                   1566:  */
                   1567: static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p)
                   1568: {
                   1569:        fe t0;
                   1570: 
                   1571:        fe_sq(r->X, p->X);
                   1572:        fe_sq(r->Z, p->Y);
                   1573:        fe_sq2(r->T, p->Z);
                   1574:        fe_add(r->Y, p->X, p->Y);
                   1575:        fe_sq(t0, r->Y);
                   1576:        fe_add(r->Y, r->Z, r->X);
                   1577:        fe_sub(r->Z, r->Z, r->X);
                   1578:        fe_sub(r->X, t0, r->Y);
                   1579:        fe_sub(r->T, r->T, r->Z);
                   1580: }
                   1581: 
                   1582: static void ge_p3_0(ge_p3 *h)
                   1583: {
                   1584:        fe_0(h->X);
                   1585:        fe_1(h->Y);
                   1586:        fe_1(h->Z);
                   1587:        fe_0(h->T);
                   1588: }
                   1589: 
                   1590: /**
                   1591:  * r = p
                   1592:  */
                   1593: 
                   1594: /* 2 * d = 16295367250680780974490674513165176452449235426866156013048779062215315747161 */
                   1595: static const fe d2 = {
                   1596:        -21827239,  -5839606, -30745221, 13898782,  229458,
                   1597:         15978800, -12551817,  -6495438, 29715968, 9444199
                   1598: };
                   1599: 
                   1600: static void ge_p3_to_cached(ge_cached *r, const ge_p3 *p)
                   1601: {
                   1602:        fe_add(r->YplusX,p->Y,p->X);
                   1603:        fe_sub(r->YminusX,p->Y,p->X);
                   1604:        fe_copy(r->Z,p->Z);
                   1605:        fe_mul(r->T2d,p->T,d2);
                   1606: }
                   1607: 
                   1608: /**
                   1609:  * r = p
                   1610:  */
                   1611: static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p)
                   1612: {
                   1613:        fe_copy(r->X,p->X);
                   1614:        fe_copy(r->Y,p->Y);
                   1615:        fe_copy(r->Z,p->Z);
                   1616: }
                   1617: 
                   1618: void ge_p3_tobytes(uint8_t *s, const ge_p3 *h)
                   1619: {
                   1620:        fe recip, x, y;
                   1621: 
                   1622:        fe_invert(recip,h->Z);
                   1623:        fe_mul(x,h->X,recip);
                   1624:        fe_mul(y,h->Y,recip);
                   1625:        fe_tobytes(s,y);
                   1626: 
                   1627:        s[31] ^= fe_isnegative(x) << 7;
                   1628: }
                   1629: 
                   1630: /**
                   1631:  * r = 2 * p
                   1632:  */
                   1633: static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p)
                   1634: {
                   1635:        ge_p2 q;
                   1636:        ge_p3_to_p2(&q,p);
                   1637:        ge_p2_dbl(r,&q);
                   1638: }
                   1639: 
                   1640: static void ge_precomp_0(ge_precomp *h)
                   1641: {
                   1642:        fe_1(h->yplusx);
                   1643:        fe_1(h->yminusx);
                   1644:        fe_0(h->xy2d);
                   1645: }
                   1646: 
                   1647: static uint8_t equal(int8_t b, int8_t c)
                   1648: {
                   1649:        uint8_t ub = b;
                   1650:        uint8_t uc = c;
                   1651:        uint8_t x = ub ^ uc;  /* 0: yes; 1..255: no */
                   1652:        uint32_t y = x;       /* 0: yes; 1..255: no */
                   1653: 
                   1654:        y -= 1;      /* 4294967295: yes; 0..254: no */
                   1655:        y >>= 31;    /* 1: yes; 0: no */
                   1656: 
                   1657:        return y;
                   1658: }
                   1659: 
                   1660: static uint8_t negative(int8_t b)
                   1661: {
                   1662:        uint64_t x = b; /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
                   1663: 
                   1664:        x >>= 63;       /* 1: yes; 0: no */
                   1665: 
                   1666:        return x;
                   1667: }
                   1668: 
                   1669: static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b)
                   1670: {
                   1671:        fe_cmov(t->yplusx,u->yplusx,b);
                   1672:        fe_cmov(t->yminusx,u->yminusx,b);
                   1673:        fe_cmov(t->xy2d,u->xy2d,b);
                   1674: }
                   1675: 
                   1676: /**
                   1677:  * base[i][j] = (j+1)*256^i*B
                   1678:  */
                   1679: static const ge_precomp base[32][8] = {
                   1680: #include "base.h"
                   1681: };
                   1682: 
                   1683: static void ge_select(ge_precomp *t, int pos, int8_t b)
                   1684: {
                   1685:        ge_precomp minust;
                   1686:        uint8_t bnegative = negative(b);
                   1687:        uint8_t babs = b - (((-bnegative) & b) * ((int8_t) 1 << 1));
                   1688: 
                   1689:        ge_precomp_0(t);
                   1690:        cmov(t,&base[pos][0],equal(babs,1));
                   1691:        cmov(t,&base[pos][1],equal(babs,2));
                   1692:        cmov(t,&base[pos][2],equal(babs,3));
                   1693:        cmov(t,&base[pos][3],equal(babs,4));
                   1694:        cmov(t,&base[pos][4],equal(babs,5));
                   1695:        cmov(t,&base[pos][5],equal(babs,6));
                   1696:        cmov(t,&base[pos][6],equal(babs,7));
                   1697:        cmov(t,&base[pos][7],equal(babs,8));
                   1698:        fe_copy(minust.yplusx,t->yminusx);
                   1699:        fe_copy(minust.yminusx,t->yplusx);
                   1700:        fe_neg(minust.xy2d,t->xy2d);
                   1701:        cmov(t,&minust,bnegative);
                   1702: }
                   1703: 
                   1704: /**
                   1705:  *r = p - q
                   1706:  */
                   1707: static void ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q)
                   1708: {
                   1709:        fe t0;
                   1710: 
                   1711:        fe_add(r->X, p->Y, p->X);
                   1712:        fe_sub(r->Y, p->Y, p->X);
                   1713:        fe_mul(r->Z, r->X, q->YminusX);
                   1714:        fe_mul(r->Y, r->Y, q->YplusX);
                   1715:        fe_mul(r->T, q->T2d, p->T);
                   1716:        fe_mul(r->X, p->Z, q->Z);
                   1717:        fe_add(t0, r->X, r->X);
                   1718:        fe_sub(r->X, r->Z, r->Y);
                   1719:        fe_add(r->Y, r->Z, r->Y);
                   1720:        fe_sub(r->Z, t0, r->T);
                   1721:        fe_add(r->T, t0, r->T);
                   1722: }
                   1723: 
                   1724: void ge_tobytes(uint8_t *s, const ge_p2 *h)
                   1725: {
                   1726:        fe recip, x, y;
                   1727: 
                   1728:        fe_invert(recip,h->Z);
                   1729:        fe_mul(x,h->X,recip);
                   1730:        fe_mul(y,h->Y,recip);
                   1731:        fe_tobytes(s,y);
                   1732: 
                   1733:        s[31] ^= fe_isnegative(x) << 7;
                   1734: }
                   1735: 
                   1736: /**
                   1737:  * h = a * B
                   1738:  * where a = a[0]+256*a[1]+...+256^31 a[31]
                   1739:  * B is the Ed25519 base point (x,4/5) with x positive.
                   1740:  *
                   1741:  * Preconditions:
                   1742:  * a[31] <= 127
                   1743:  */
                   1744: 
                   1745: /**
                   1746:  * r = a * A + b * B
                   1747:  * where a = a[0]+256*a[1]+...+256^31 a[31].
                   1748:  * and b = b[0]+256*b[1]+...+256^31 b[31].
                   1749:  * B is the Ed25519 base point (x,4/5) with x positive.
                   1750:  */
                   1751: void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, const ge_p3 *A,
                   1752:                                                                  const uint8_t *b)
                   1753: {
                   1754:        int8_t aslide[256];
                   1755:        int8_t bslide[256];
                   1756:        ge_cached Ai[8];     /* A,3A,5A,7A,9A,11A,13A,15A */
                   1757:        ge_p1p1 t;
                   1758:        ge_p3 u, A2;
                   1759:        int i;
                   1760: 
                   1761:        slide(aslide,a);
                   1762:        slide(bslide,b);
                   1763: 
                   1764:        ge_p3_to_cached(&Ai[0],A);
                   1765:        ge_p3_dbl(&t,A);
                   1766:        ge_p1p1_to_p3(&A2,&t);
                   1767: 
                   1768:        ge_add(&t,&A2,&Ai[0]);
                   1769:        ge_p1p1_to_p3(&u,&t);
                   1770:        ge_p3_to_cached(&Ai[1],&u);
                   1771: 
                   1772:        ge_add(&t,&A2,&Ai[1]);
                   1773:        ge_p1p1_to_p3(&u,&t);
                   1774:        ge_p3_to_cached(&Ai[2],&u);
                   1775: 
                   1776:        ge_add(&t,&A2,&Ai[2]);
                   1777:        ge_p1p1_to_p3(&u,&t);
                   1778:        ge_p3_to_cached(&Ai[3],&u);
                   1779: 
                   1780:        ge_add(&t,&A2,&Ai[3]);
                   1781:        ge_p1p1_to_p3(&u,&t);
                   1782:        ge_p3_to_cached(&Ai[4],&u);
                   1783: 
                   1784:        ge_add(&t,&A2,&Ai[4]);
                   1785:        ge_p1p1_to_p3(&u,&t);
                   1786:        ge_p3_to_cached(&Ai[5],&u);
                   1787: 
                   1788:        ge_add(&t,&A2,&Ai[5]);
                   1789:        ge_p1p1_to_p3(&u,&t);
                   1790:        ge_p3_to_cached(&Ai[6],&u);
                   1791: 
                   1792:        ge_add(&t,&A2,&Ai[6]);
                   1793:        ge_p1p1_to_p3(&u,&t);
                   1794:        ge_p3_to_cached(&Ai[7],&u);
                   1795: 
                   1796:        ge_p2_0(r);
                   1797: 
                   1798:        for (i = 255; i >= 0; --i)
                   1799:        {
                   1800:                if (aslide[i] || bslide[i])
                   1801:                {
                   1802:                        break;
                   1803:                }
                   1804:        }
                   1805: 
                   1806:        for (; i >= 0 ;--i)
                   1807:        {
                   1808:                ge_p2_dbl(&t,r);
                   1809: 
                   1810:                if (aslide[i] > 0)
                   1811:                {
                   1812:                        ge_p1p1_to_p3(&u,&t);
                   1813:                        ge_add(&t,&u,&Ai[aslide[i]/2]);
                   1814:                }
                   1815:                else if (aslide[i] < 0)
                   1816:                {
                   1817:                        ge_p1p1_to_p3(&u,&t);
                   1818:                        ge_sub(&t,&u,&Ai[(-aslide[i])/2]);
                   1819:                }
                   1820: 
                   1821:                if (bslide[i] > 0)
                   1822:                {
                   1823:                        ge_p1p1_to_p3(&u,&t);
                   1824:                        ge_madd(&t,&u,&Bi[bslide[i]/2]);
                   1825:                }
                   1826:                else if (bslide[i] < 0)
                   1827:                {
                   1828:                        ge_p1p1_to_p3(&u,&t);
                   1829:                        ge_msub(&t,&u,&Bi[(-bslide[i])/2]);
                   1830:                }
                   1831:                ge_p1p1_to_p2(r,&t);
                   1832:        }
                   1833: }
                   1834: 
                   1835: void ge_scalarmult_base(ge_p3 *h, const uint8_t *a)
                   1836: {
                   1837:        int8_t e[64];
                   1838:        int8_t carry = 0;
                   1839:        ge_p1p1 r;
                   1840:        ge_p2 s;
                   1841:        ge_precomp t;
                   1842:        int i;
                   1843: 
                   1844:        for (i = 0; i < 32; ++i)
                   1845:        {
                   1846:                e[2 * i + 0] = (a[i] >> 0) & 15;
                   1847:                e[2 * i + 1] = (a[i] >> 4) & 15;
                   1848:        }
                   1849:        /* each e[i] is between 0 and 15 */
                   1850:        /* e[63] is between 0 and 7 */
                   1851: 
                   1852:        for (i = 0; i < 63; ++i) {
                   1853:                e[i] += carry;
                   1854:                carry = e[i] + 8;
                   1855:                carry >>= 4;
                   1856:                e[i] -= carry * ((int8_t) 1 << 4);
                   1857:        }
                   1858:        e[63] += carry;
                   1859:        /* each e[i] is between -8 and 8 */
                   1860: 
                   1861:        ge_p3_0(h);
                   1862:        for (i = 1; i < 64; i += 2)
                   1863:        {
                   1864:                ge_select(&t,i / 2,e[i]);
                   1865:                ge_madd(&r,h,&t); ge_p1p1_to_p3(h,&r);
                   1866:        }
                   1867: 
                   1868:        ge_p3_dbl(&r,h);  ge_p1p1_to_p2(&s,&r);
                   1869:        ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
                   1870:        ge_p2_dbl(&r,&s); ge_p1p1_to_p2(&s,&r);
                   1871:        ge_p2_dbl(&r,&s); ge_p1p1_to_p3(h,&r);
                   1872: 
                   1873:        for (i = 0; i < 64; i += 2)
                   1874:        {
                   1875:                ge_select(&t,i / 2,e[i]);
                   1876:                ge_madd(&r,h,&t);
                   1877:                ge_p1p1_to_p3(h,&r);
                   1878:        }
                   1879: }
                   1880: 
                   1881: /**
                   1882:  * Input:
                   1883:  * a[0]+256*a[1]+...+256^31*a[31] = a
                   1884:  * b[0]+256*b[1]+...+256^31*b[31] = b
                   1885:  * c[0]+256*c[1]+...+256^31*c[31] = c
                   1886:  *
                   1887:  * Output:
                   1888:  * s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
                   1889:  * where l = 2^252 + 27742317777372353535851937790883648493.
                   1890:  */
                   1891: void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, const uint8_t *c)
                   1892: {
                   1893:        int64_t a0  = 2097151 & load_3(a);
                   1894:        int64_t a1  = 2097151 & (load_4(a + 2) >> 5);
                   1895:        int64_t a2  = 2097151 & (load_3(a + 5) >> 2);
                   1896:        int64_t a3  = 2097151 & (load_4(a + 7) >> 7);
                   1897:        int64_t a4  = 2097151 & (load_4(a + 10) >> 4);
                   1898:        int64_t a5  = 2097151 & (load_3(a + 13) >> 1);
                   1899:        int64_t a6  = 2097151 & (load_4(a + 15) >> 6);
                   1900:        int64_t a7  = 2097151 & (load_3(a + 18) >> 3);
                   1901:        int64_t a8  = 2097151 & load_3(a + 21);
                   1902:        int64_t a9  = 2097151 & (load_4(a + 23) >> 5);
                   1903:        int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
                   1904:        int64_t a11 = (load_4(a + 28) >> 7);
                   1905: 
                   1906:        int64_t b0  = 2097151 & load_3(b);
                   1907:        int64_t b1  = 2097151 & (load_4(b + 2) >> 5);
                   1908:        int64_t b2  = 2097151 & (load_3(b + 5) >> 2);
                   1909:        int64_t b3  = 2097151 & (load_4(b + 7) >> 7);
                   1910:        int64_t b4  = 2097151 & (load_4(b + 10) >> 4);
                   1911:        int64_t b5  = 2097151 & (load_3(b + 13) >> 1);
                   1912:        int64_t b6  = 2097151 & (load_4(b + 15) >> 6);
                   1913:        int64_t b7  = 2097151 & (load_3(b + 18) >> 3);
                   1914:        int64_t b8  = 2097151 & load_3(b + 21);
                   1915:        int64_t b9  = 2097151 & (load_4(b + 23) >> 5);
                   1916:        int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
                   1917:        int64_t b11 = (load_4(b + 28) >> 7);
                   1918: 
                   1919:        int64_t c0  = 2097151 & load_3(c);
                   1920:        int64_t c1  = 2097151 & (load_4(c + 2) >> 5);
                   1921:        int64_t c2  = 2097151 & (load_3(c + 5) >> 2);
                   1922:        int64_t c3  = 2097151 & (load_4(c + 7) >> 7);
                   1923:        int64_t c4  = 2097151 & (load_4(c + 10) >> 4);
                   1924:        int64_t c5  = 2097151 & (load_3(c + 13) >> 1);
                   1925:        int64_t c6  = 2097151 & (load_4(c + 15) >> 6);
                   1926:        int64_t c7  = 2097151 & (load_3(c + 18) >> 3);
                   1927:        int64_t c8  = 2097151 & load_3(c + 21);
                   1928:        int64_t c9  = 2097151 & (load_4(c + 23) >> 5);
                   1929:        int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
                   1930:        int64_t c11 = (load_4(c + 28) >> 7);
                   1931: 
                   1932:        int64_t s0,  s1,  s2,  s3,  s4,  s5,  s6,  s7,  s8,  s9,  s10, s11;
                   1933:        int64_t s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23;
                   1934: 
                   1935:        int64_t carry0,  carry1,  carry2,   carry3, carry4,  carry5,  carry6;
                   1936:        int64_t carry7,  carry8,  carry9,  carry10, carry11, carry12, carry13;
                   1937:        int64_t carry14, carry15, carry16, carry17, carry18, carry19, carry20;
                   1938:        int64_t carry21, carry22;
                   1939: 
                   1940:        s0 = c0 + a0*b0;
                   1941:        s1 = c1 + a0*b1 + a1*b0;
                   1942:        s2 = c2 + a0*b2 + a1*b1 + a2*b0;
                   1943:        s3 = c3 + a0*b3 + a1*b2 + a2*b1 + a3*b0;
                   1944:        s4 = c4 + a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0;
                   1945:        s5 = c5 + a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0;
                   1946:        s6 = c6 + a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0;
                   1947:        s7 = c7 + a0*b7 + a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1 + a7*b0;
                   1948:        s8 = c8 + a0*b8 + a1*b7 + a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2 + a7*b1 + a8*b0;
                   1949:        s9 = c9 + a0*b9 + a1*b8 + a2*b7 + a3*b6 + a4*b5 + a5*b4 + a6*b3 + a7*b2 + a8*b1 + a9*b0;
                   1950:        s10 = c10 + a0*b10 + a1*b9 + a2*b8 + a3*b7 + a4*b6 + a5*b5 + a6*b4 + a7*b3 + a8*b2 + a9*b1 + a10*b0;
                   1951:        s11 = c11 + a0*b11 + a1*b10 + a2*b9 + a3*b8 + a4*b7 + a5*b6 + a6*b5 + a7*b4 + a8*b3 + a9*b2 + a10*b1 + a11*b0;
                   1952:        s12 = a1*b11 + a2*b10 + a3*b9 + a4*b8 + a5*b7 + a6*b6 + a7*b5 + a8*b4 + a9*b3 + a10*b2 + a11*b1;
                   1953:        s13 = a2*b11 + a3*b10 + a4*b9 + a5*b8 + a6*b7 + a7*b6 + a8*b5 + a9*b4 + a10*b3 + a11*b2;
                   1954:        s14 = a3*b11 + a4*b10 + a5*b9 + a6*b8 + a7*b7 + a8*b6 + a9*b5 + a10*b4 + a11*b3;
                   1955:        s15 = a4*b11 + a5*b10 + a6*b9 + a7*b8 + a8*b7 + a9*b6 + a10*b5 + a11*b4;
                   1956:        s16 = a5*b11 + a6*b10 + a7*b9 + a8*b8 + a9*b7 + a10*b6 + a11*b5;
                   1957:        s17 = a6*b11 + a7*b10 + a8*b9 + a9*b8 + a10*b7 + a11*b6;
                   1958:        s18 = a7*b11 + a8*b10 + a9*b9 + a10*b8 + a11*b7;
                   1959:        s19 = a8*b11 + a9*b10 + a10*b9 + a11*b8;
                   1960:        s20 = a9*b11 + a10*b10 + a11*b9;
                   1961:        s21 = a10*b11 + a11*b10;
                   1962:        s22 = a11*b11;
                   1963:        s23 = 0;
                   1964: 
                   1965:        carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
                   1966:        s1 += carry0;
                   1967:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   1968: 
                   1969:        carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
                   1970:        s3 += carry2;
                   1971:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   1972: 
                   1973:        carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
                   1974:        s5 += carry4;
                   1975:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   1976: 
                   1977:        carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
                   1978:        s7 += carry6;
                   1979:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   1980: 
                   1981:        carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
                   1982:        s9 += carry8;
                   1983:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   1984: 
                   1985:        carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
                   1986:        s11 += carry10;
                   1987:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   1988: 
                   1989:        carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
                   1990:        s13 += carry12;
                   1991:        s12 -= carry12 * ((uint64_t) 1L << 21);
                   1992: 
                   1993:        carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
                   1994:        s15 += carry14;
                   1995:        s14 -= carry14 * ((uint64_t) 1L << 21);
                   1996: 
                   1997:        carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
                   1998:        s17 += carry16;
                   1999:        s16 -= carry16 * ((uint64_t) 1L << 21);
                   2000: 
                   2001:        carry18 = (s18 + (int64_t) (1L << 20)) >> 21;
                   2002:        s19 += carry18;
                   2003:        s18 -= carry18 * ((uint64_t) 1L << 21);
                   2004: 
                   2005:        carry20 = (s20 + (int64_t) (1L << 20)) >> 21;
                   2006:        s21 += carry20;
                   2007:        s20 -= carry20 * ((uint64_t) 1L << 21);
                   2008: 
                   2009:        carry22 = (s22 + (int64_t) (1L << 20)) >> 21;
                   2010:        s23 += carry22;
                   2011:        s22 -= carry22 * ((uint64_t) 1L << 21);
                   2012: 
                   2013:        carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
                   2014:        s2 += carry1;
                   2015:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2016: 
                   2017:        carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
                   2018:        s4 += carry3;
                   2019:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2020: 
                   2021:        carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
                   2022:        s6 += carry5;
                   2023:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2024: 
                   2025:        carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
                   2026:        s8 += carry7;
                   2027:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2028: 
                   2029:        carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
                   2030:        s10 += carry9;
                   2031:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2032: 
                   2033:        carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
                   2034:        s12 += carry11;
                   2035:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2036: 
                   2037:        carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
                   2038:        s14 += carry13;
                   2039:        s13 -= carry13 * ((uint64_t) 1L << 21);
                   2040: 
                   2041:        carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
                   2042:        s16 += carry15;
                   2043:        s15 -= carry15 * ((uint64_t) 1L << 21);
                   2044: 
                   2045:        carry17 = (s17 + (int64_t) (1L << 20)) >> 21;
                   2046:        s18 += carry17;
                   2047:        s17 -= carry17 * ((uint64_t) 1L << 21);
                   2048: 
                   2049:        carry19 = (s19 + (int64_t) (1L << 20)) >> 21;
                   2050:        s20 += carry19;
                   2051:        s19 -= carry19 * ((uint64_t) 1L << 21);
                   2052: 
                   2053:        carry21 = (s21 + (int64_t) (1L << 20)) >> 21;
                   2054:        s22 += carry21;
                   2055:        s21 -= carry21 * ((uint64_t) 1L << 21);
                   2056: 
                   2057:        s11 += s23 * 666643;
                   2058:        s12 += s23 * 470296;
                   2059:        s13 += s23 * 654183;
                   2060:        s14 -= s23 * 997805;
                   2061:        s15 += s23 * 136657;
                   2062:        s16 -= s23 * 683901;
                   2063: 
                   2064:        s10 += s22 * 666643;
                   2065:        s11 += s22 * 470296;
                   2066:        s12 += s22 * 654183;
                   2067:        s13 -= s22 * 997805;
                   2068:        s14 += s22 * 136657;
                   2069:        s15 -= s22 * 683901;
                   2070: 
                   2071:        s9  += s21 * 666643;
                   2072:        s10 += s21 * 470296;
                   2073:        s11 += s21 * 654183;
                   2074:        s12 -= s21 * 997805;
                   2075:        s13 += s21 * 136657;
                   2076:        s14 -= s21 * 683901;
                   2077: 
                   2078:        s8  += s20 * 666643;
                   2079:        s9  += s20 * 470296;
                   2080:        s10 += s20 * 654183;
                   2081:        s11 -= s20 * 997805;
                   2082:        s12 += s20 * 136657;
                   2083:        s13 -= s20 * 683901;
                   2084: 
                   2085:        s7  += s19 * 666643;
                   2086:        s8  += s19 * 470296;
                   2087:        s9  += s19 * 654183;
                   2088:        s10 -= s19 * 997805;
                   2089:        s11 += s19 * 136657;
                   2090:        s12 -= s19 * 683901;
                   2091: 
                   2092:        s6  += s18 * 666643;
                   2093:        s7  += s18 * 470296;
                   2094:        s8  += s18 * 654183;
                   2095:        s9  -= s18 * 997805;
                   2096:        s10 += s18 * 136657;
                   2097:        s11 -= s18 * 683901;
                   2098: 
                   2099:        carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
                   2100:        s7 += carry6;
                   2101:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2102: 
                   2103:        carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
                   2104:        s9 += carry8;
                   2105:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2106: 
                   2107:        carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
                   2108:        s11 += carry10;
                   2109:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2110: 
                   2111:        carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
                   2112:        s13 += carry12;
                   2113:        s12 -= carry12 * ((uint64_t) 1L << 21);
                   2114: 
                   2115:        carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
                   2116:        s15 += carry14;
                   2117:        s14 -= carry14 * ((uint64_t) 1L << 21);
                   2118: 
                   2119:        carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
                   2120:        s17 += carry16;
                   2121:        s16 -= carry16 * ((uint64_t) 1L << 21);
                   2122: 
                   2123:        carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
                   2124:        s8 += carry7;
                   2125:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2126: 
                   2127:        carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
                   2128:        s10 += carry9;
                   2129:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2130: 
                   2131:        carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
                   2132:        s12 += carry11;
                   2133:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2134: 
                   2135:        carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
                   2136:        s14 += carry13;
                   2137:        s13 -= carry13 * ((uint64_t) 1L << 21);
                   2138: 
                   2139:        carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
                   2140:        s16 += carry15;
                   2141:        s15 -= carry15 * ((uint64_t) 1L << 21);
                   2142: 
                   2143:        s5  += s17 * 666643;
                   2144:        s6  += s17 * 470296;
                   2145:        s7  += s17 * 654183;
                   2146:        s8  -= s17 * 997805;
                   2147:        s9  += s17 * 136657;
                   2148:        s10 -= s17 * 683901;
                   2149: 
                   2150:        s4  += s16 * 666643;
                   2151:        s5  += s16 * 470296;
                   2152:        s6  += s16 * 654183;
                   2153:        s7  -= s16 * 997805;
                   2154:        s8  += s16 * 136657;
                   2155:        s9  -= s16 * 683901;
                   2156: 
                   2157:        s3  += s15 * 666643;
                   2158:        s4  += s15 * 470296;
                   2159:        s5  += s15 * 654183;
                   2160:        s6  -= s15 * 997805;
                   2161:        s7  += s15 * 136657;
                   2162:        s8  -= s15 * 683901;
                   2163: 
                   2164:        s2  += s14 * 666643;
                   2165:        s3  += s14 * 470296;
                   2166:        s4  += s14 * 654183;
                   2167:        s5  -= s14 * 997805;
                   2168:        s6  += s14 * 136657;
                   2169:        s7  -= s14 * 683901;
                   2170: 
                   2171:        s1  += s13 * 666643;
                   2172:        s2  += s13 * 470296;
                   2173:        s3  += s13 * 654183;
                   2174:        s4  -= s13 * 997805;
                   2175:        s5  += s13 * 136657;
                   2176:        s6  -= s13 * 683901;
                   2177: 
                   2178:        s0  += s12 * 666643;
                   2179:        s1  += s12 * 470296;
                   2180:        s2  += s12 * 654183;
                   2181:        s3  -= s12 * 997805;
                   2182:        s4  += s12 * 136657;
                   2183:        s5  -= s12 * 683901;
                   2184:        s12 = 0;
                   2185: 
                   2186:        carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
                   2187:        s1 += carry0;
                   2188:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2189: 
                   2190:        carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
                   2191:        s3 += carry2;
                   2192:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2193: 
                   2194:        carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
                   2195:        s5 += carry4;
                   2196:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2197: 
                   2198:        carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
                   2199:        s7 += carry6;
                   2200:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2201: 
                   2202:        carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
                   2203:        s9 += carry8;
                   2204:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2205: 
                   2206:        carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
                   2207:        s11 += carry10;
                   2208:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2209: 
                   2210:        carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
                   2211:        s2 += carry1;
                   2212:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2213: 
                   2214:        carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
                   2215:        s4 += carry3;
                   2216:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2217: 
                   2218:        carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
                   2219:        s6 += carry5;
                   2220:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2221: 
                   2222:        carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
                   2223:        s8 += carry7;
                   2224:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2225: 
                   2226:        carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
                   2227:        s10 += carry9;
                   2228:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2229: 
                   2230:        carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
                   2231:        s12 += carry11;
                   2232:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2233: 
                   2234:        s0 += s12 * 666643;
                   2235:        s1 += s12 * 470296;
                   2236:        s2 += s12 * 654183;
                   2237:        s3 -= s12 * 997805;
                   2238:        s4 += s12 * 136657;
                   2239:        s5 -= s12 * 683901;
                   2240:        s12 = 0;
                   2241: 
                   2242:        carry0 = s0 >> 21;
                   2243:        s1 += carry0;
                   2244:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2245: 
                   2246:        carry1 = s1 >> 21;
                   2247:        s2 += carry1;
                   2248:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2249: 
                   2250:        carry2 = s2 >> 21;
                   2251:        s3 += carry2;
                   2252:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2253: 
                   2254:        carry3 = s3 >> 21;
                   2255:        s4 += carry3;
                   2256:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2257: 
                   2258:        carry4 = s4 >> 21;
                   2259:        s5 += carry4;
                   2260:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2261: 
                   2262:        carry5 = s5 >> 21;
                   2263:        s6 += carry5;
                   2264:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2265: 
                   2266:        carry6 = s6 >> 21;
                   2267:        s7 += carry6;
                   2268:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2269: 
                   2270:        carry7 = s7 >> 21;
                   2271:        s8 += carry7;
                   2272:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2273: 
                   2274:        carry8 = s8 >> 21;
                   2275:        s9 += carry8;
                   2276:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2277: 
                   2278:        carry9 = s9 >> 21;
                   2279:        s10 += carry9;
                   2280:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2281: 
                   2282:        carry10 = s10 >> 21;
                   2283:        s11 += carry10;
                   2284:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2285: 
                   2286:        carry11 = s11 >> 21;
                   2287:        s12 += carry11;
                   2288:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2289: 
                   2290:        s0 += s12 * 666643;
                   2291:        s1 += s12 * 470296;
                   2292:        s2 += s12 * 654183;
                   2293:        s3 -= s12 * 997805;
                   2294:        s4 += s12 * 136657;
                   2295:        s5 -= s12 * 683901;
                   2296: 
                   2297:        carry0 = s0 >> 21;
                   2298:        s1 += carry0;
                   2299:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2300: 
                   2301:        carry1 = s1 >> 21;
                   2302:        s2 += carry1;
                   2303:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2304: 
                   2305:        carry2 = s2 >> 21;
                   2306:        s3 += carry2;
                   2307:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2308: 
                   2309:        carry3 = s3 >> 21;
                   2310:        s4 += carry3;
                   2311:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2312: 
                   2313:        carry4 = s4 >> 21;
                   2314:        s5 += carry4;
                   2315:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2316: 
                   2317:        carry5 = s5 >> 21;
                   2318:        s6 += carry5;
                   2319:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2320: 
                   2321:        carry6 = s6 >> 21;
                   2322:        s7 += carry6;
                   2323:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2324: 
                   2325:        carry7 = s7 >> 21;
                   2326:        s8 += carry7;
                   2327:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2328: 
                   2329:        carry8 = s8 >> 21;
                   2330:        s9 += carry8;
                   2331:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2332: 
                   2333:        carry9 = s9 >> 21;
                   2334:        s10 += carry9;
                   2335:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2336: 
                   2337:        carry10 = s10 >> 21;
                   2338:        s11 += carry10;
                   2339:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2340: 
                   2341:        s[0]  = s0 >> 0;
                   2342:        s[1]  = s0 >> 8;
                   2343:        s[2]  = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
                   2344:        s[3]  = s1 >> 3;
                   2345:        s[4]  = s1 >> 11;
                   2346:        s[5]  = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
                   2347:        s[6]  = s2 >> 6;
                   2348:        s[7]  = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
                   2349:        s[8]  = s3 >> 1;
                   2350:        s[9]  = s3 >> 9;
                   2351:        s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
                   2352:        s[11] = s4 >> 4;
                   2353:        s[12] = s4 >> 12;
                   2354:        s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
                   2355:        s[14] = s5 >> 7;
                   2356:        s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
                   2357:        s[16] = s6 >> 2;
                   2358:        s[17] = s6 >> 10;
                   2359:        s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
                   2360:        s[19] = s7 >> 5;
                   2361:        s[20] = s7 >> 13;
                   2362:        s[21] = s8 >> 0;
                   2363:        s[22] = s8 >> 8;
                   2364:        s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
                   2365:        s[24] = s9 >> 3;
                   2366:        s[25] = s9 >> 11;
                   2367:        s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
                   2368:        s[27] = s10 >> 6;
                   2369:        s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
                   2370:        s[29] = s11 >> 1;
                   2371:        s[30] = s11 >> 9;
                   2372:        s[31] = s11 >> 17;
                   2373: }
                   2374: 
                   2375: /**
                   2376:  * Input:
                   2377:  * s[0]+256*s[1]+...+256^63*s[63] = s
                   2378:  *
                   2379:  * Output:
                   2380:  * s[0]+256*s[1]+...+256^31*s[31] = s mod l
                   2381:  * where l = 2^252 + 27742317777372353535851937790883648493.
                   2382:  * Overwrites s in place.
                   2383:  */
                   2384: void sc_reduce(uint8_t *s)
                   2385: {
                   2386:        int64_t s0  = 2097151 & load_3(s);
                   2387:        int64_t s1  = 2097151 & (load_4(s + 2) >> 5);
                   2388:        int64_t s2  = 2097151 & (load_3(s + 5) >> 2);
                   2389:        int64_t s3  = 2097151 & (load_4(s + 7) >> 7);
                   2390:        int64_t s4  = 2097151 & (load_4(s + 10) >> 4);
                   2391:        int64_t s5  = 2097151 & (load_3(s + 13) >> 1);
                   2392:        int64_t s6  = 2097151 & (load_4(s + 15) >> 6);
                   2393:        int64_t s7  = 2097151 & (load_3(s + 18) >> 3);
                   2394:        int64_t s8  = 2097151 & load_3(s + 21);
                   2395:        int64_t s9  = 2097151 & (load_4(s + 23) >> 5);
                   2396:        int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
                   2397:        int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
                   2398:        int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
                   2399:        int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
                   2400:        int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
                   2401:        int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
                   2402:        int64_t s16 = 2097151 & load_3(s + 42);
                   2403:        int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
                   2404:        int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
                   2405:        int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
                   2406:        int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
                   2407:        int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
                   2408:        int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
                   2409:        int64_t s23 = (load_4(s + 60) >> 3);
                   2410: 
                   2411:        int64_t carry0,  carry1,  carry2,   carry3,  carry4,  carry5,  carry6;
                   2412:        int64_t carry7,  carry8,  carry9,  carry10, carry11, carry12, carry13;
                   2413:        int64_t carry14, carry15, carry16;
                   2414: 
                   2415:        s11 += s23 * 666643;
                   2416:        s12 += s23 * 470296;
                   2417:        s13 += s23 * 654183;
                   2418:        s14 -= s23 * 997805;
                   2419:        s15 += s23 * 136657;
                   2420:        s16 -= s23 * 683901;
                   2421: 
                   2422:        s10 += s22 * 666643;
                   2423:        s11 += s22 * 470296;
                   2424:        s12 += s22 * 654183;
                   2425:        s13 -= s22 * 997805;
                   2426:        s14 += s22 * 136657;
                   2427:        s15 -= s22 * 683901;
                   2428: 
                   2429:        s9  += s21 * 666643;
                   2430:        s10 += s21 * 470296;
                   2431:        s11 += s21 * 654183;
                   2432:        s12 -= s21 * 997805;
                   2433:        s13 += s21 * 136657;
                   2434:        s14 -= s21 * 683901;
                   2435: 
                   2436:        s8  += s20 * 666643;
                   2437:        s9  += s20 * 470296;
                   2438:        s10 += s20 * 654183;
                   2439:        s11 -= s20 * 997805;
                   2440:        s12 += s20 * 136657;
                   2441:        s13 -= s20 * 683901;
                   2442: 
                   2443:        s7  += s19 * 666643;
                   2444:        s8  += s19 * 470296;
                   2445:        s9  += s19 * 654183;
                   2446:        s10 -= s19 * 997805;
                   2447:        s11 += s19 * 136657;
                   2448:        s12 -= s19 * 683901;
                   2449: 
                   2450:        s6  += s18 * 666643;
                   2451:        s7  += s18 * 470296;
                   2452:        s8  += s18 * 654183;
                   2453:        s9  -= s18 * 997805;
                   2454:        s10 += s18 * 136657;
                   2455:        s11 -= s18 * 683901;
                   2456: 
                   2457:        carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
                   2458:        s7 += carry6;
                   2459:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2460: 
                   2461:        carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
                   2462:        s9 += carry8;
                   2463:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2464: 
                   2465:        carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
                   2466:        s11 += carry10;
                   2467:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2468: 
                   2469:        carry12 = (s12 + (int64_t) (1L << 20)) >> 21;
                   2470:        s13 += carry12;
                   2471:        s12 -= carry12 * ((uint64_t) 1L << 21);
                   2472: 
                   2473:        carry14 = (s14 + (int64_t) (1L << 20)) >> 21;
                   2474:        s15 += carry14;
                   2475:        s14 -= carry14 * ((uint64_t) 1L << 21);
                   2476: 
                   2477:        carry16 = (s16 + (int64_t) (1L << 20)) >> 21;
                   2478:        s17 += carry16;
                   2479:        s16 -= carry16 * ((uint64_t) 1L << 21);
                   2480: 
                   2481:        carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
                   2482:        s8 += carry7;
                   2483:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2484: 
                   2485:        carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
                   2486:        s10 += carry9;
                   2487:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2488: 
                   2489:        carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
                   2490:        s12 += carry11;
                   2491:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2492: 
                   2493:        carry13 = (s13 + (int64_t) (1L << 20)) >> 21;
                   2494:        s14 += carry13;
                   2495:        s13 -= carry13 * ((uint64_t) 1L << 21);
                   2496: 
                   2497:        carry15 = (s15 + (int64_t) (1L << 20)) >> 21;
                   2498:        s16 += carry15;
                   2499:        s15 -= carry15 * ((uint64_t) 1L << 21);
                   2500: 
                   2501:        s5  += s17 * 666643;
                   2502:        s6  += s17 * 470296;
                   2503:        s7  += s17 * 654183;
                   2504:        s8  -= s17 * 997805;
                   2505:        s9  += s17 * 136657;
                   2506:        s10 -= s17 * 683901;
                   2507: 
                   2508:        s4  += s16 * 666643;
                   2509:        s5  += s16 * 470296;
                   2510:        s6  += s16 * 654183;
                   2511:        s7  -= s16 * 997805;
                   2512:        s8  += s16 * 136657;
                   2513:        s9  -= s16 * 683901;
                   2514: 
                   2515:        s3  += s15 * 666643;
                   2516:        s4  += s15 * 470296;
                   2517:        s5  += s15 * 654183;
                   2518:        s6  -= s15 * 997805;
                   2519:        s7  += s15 * 136657;
                   2520:        s8  -= s15 * 683901;
                   2521: 
                   2522:        s2  += s14 * 666643;
                   2523:        s3  += s14 * 470296;
                   2524:        s4  += s14 * 654183;
                   2525:        s5  -= s14 * 997805;
                   2526:        s6  += s14 * 136657;
                   2527:        s7  -= s14 * 683901;
                   2528: 
                   2529:        s1  += s13 * 666643;
                   2530:        s2  += s13 * 470296;
                   2531:        s3  += s13 * 654183;
                   2532:        s4  -= s13 * 997805;
                   2533:        s5  += s13 * 136657;
                   2534:        s6  -= s13 * 683901;
                   2535: 
                   2536:        s0  += s12 * 666643;
                   2537:        s1  += s12 * 470296;
                   2538:        s2  += s12 * 654183;
                   2539:        s3  -= s12 * 997805;
                   2540:        s4  += s12 * 136657;
                   2541:        s5  -= s12 * 683901;
                   2542:        s12 = 0;
                   2543: 
                   2544:        carry0 = (s0 + (int64_t) (1L << 20)) >> 21;
                   2545:        s1 += carry0;
                   2546:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2547: 
                   2548:        carry2 = (s2 + (int64_t) (1L << 20)) >> 21;
                   2549:        s3 += carry2;
                   2550:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2551: 
                   2552:        carry4 = (s4 + (int64_t) (1L << 20)) >> 21;
                   2553:        s5 += carry4;
                   2554:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2555: 
                   2556:        carry6 = (s6 + (int64_t) (1L << 20)) >> 21;
                   2557:        s7 += carry6;
                   2558:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2559: 
                   2560:        carry8 = (s8 + (int64_t) (1L << 20)) >> 21;
                   2561:        s9 += carry8;
                   2562:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2563: 
                   2564:        carry10 = (s10 + (int64_t) (1L << 20)) >> 21;
                   2565:        s11 += carry10;
                   2566:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2567: 
                   2568:        carry1 = (s1 + (int64_t) (1L << 20)) >> 21;
                   2569:        s2 += carry1;
                   2570:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2571: 
                   2572:        carry3 = (s3 + (int64_t) (1L << 20)) >> 21;
                   2573:        s4 += carry3;
                   2574:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2575: 
                   2576:        carry5 = (s5 + (int64_t) (1L << 20)) >> 21;
                   2577:        s6 += carry5;
                   2578:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2579: 
                   2580:        carry7 = (s7 + (int64_t) (1L << 20)) >> 21;
                   2581:        s8 += carry7;
                   2582:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2583: 
                   2584:        carry9 = (s9 + (int64_t) (1L << 20)) >> 21;
                   2585:        s10 += carry9;
                   2586:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2587: 
                   2588:        carry11 = (s11 + (int64_t) (1L << 20)) >> 21;
                   2589:        s12 += carry11;
                   2590:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2591: 
                   2592:        s0 += s12 * 666643;
                   2593:        s1 += s12 * 470296;
                   2594:        s2 += s12 * 654183;
                   2595:        s3 -= s12 * 997805;
                   2596:        s4 += s12 * 136657;
                   2597:        s5 -= s12 * 683901;
                   2598:        s12 = 0;
                   2599: 
                   2600:        carry0 = s0 >> 21;
                   2601:        s1 += carry0;
                   2602:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2603: 
                   2604:        carry1 = s1 >> 21;
                   2605:        s2 += carry1;
                   2606:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2607: 
                   2608:        carry2 = s2 >> 21;
                   2609:        s3 += carry2;
                   2610:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2611: 
                   2612:        carry3 = s3 >> 21;
                   2613:        s4 += carry3;
                   2614:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2615: 
                   2616:        carry4 = s4 >> 21;
                   2617:        s5 += carry4;
                   2618:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2619: 
                   2620:        carry5 = s5 >> 21;
                   2621:        s6 += carry5;
                   2622:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2623: 
                   2624:        carry6 = s6 >> 21;
                   2625:        s7 += carry6;
                   2626:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2627: 
                   2628:        carry7 = s7 >> 21;
                   2629:        s8 += carry7;
                   2630:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2631: 
                   2632:        carry8 = s8 >> 21;
                   2633:        s9 += carry8;
                   2634:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2635: 
                   2636:        carry9 = s9 >> 21;
                   2637:        s10 += carry9;
                   2638:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2639: 
                   2640:        carry10 = s10 >> 21;
                   2641:        s11 += carry10;
                   2642:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2643: 
                   2644:        carry11 = s11 >> 21;
                   2645:        s12 += carry11;
                   2646:        s11 -= carry11 * ((uint64_t) 1L << 21);
                   2647: 
                   2648:        s0 += s12 * 666643;
                   2649:        s1 += s12 * 470296;
                   2650:        s2 += s12 * 654183;
                   2651:        s3 -= s12 * 997805;
                   2652:        s4 += s12 * 136657;
                   2653:        s5 -= s12 * 683901;
                   2654: 
                   2655:        carry0 = s0 >> 21;
                   2656:        s1 += carry0;
                   2657:        s0 -= carry0 * ((uint64_t) 1L << 21);
                   2658: 
                   2659:        carry1 = s1 >> 21;
                   2660:        s2 += carry1;
                   2661:        s1 -= carry1 * ((uint64_t) 1L << 21);
                   2662: 
                   2663:        carry2 = s2 >> 21;
                   2664:        s3 += carry2;
                   2665:        s2 -= carry2 * ((uint64_t) 1L << 21);
                   2666: 
                   2667:        carry3 = s3 >> 21;
                   2668:        s4 += carry3;
                   2669:        s3 -= carry3 * ((uint64_t) 1L << 21);
                   2670: 
                   2671:        carry4 = s4 >> 21;
                   2672:        s5 += carry4;
                   2673:        s4 -= carry4 * ((uint64_t) 1L << 21);
                   2674: 
                   2675:        carry5 = s5 >> 21;
                   2676:        s6 += carry5;
                   2677:        s5 -= carry5 * ((uint64_t) 1L << 21);
                   2678: 
                   2679:        carry6 = s6 >> 21;
                   2680:        s7 += carry6;
                   2681:        s6 -= carry6 * ((uint64_t) 1L << 21);
                   2682: 
                   2683:        carry7 = s7 >> 21;
                   2684:        s8 += carry7;
                   2685:        s7 -= carry7 * ((uint64_t) 1L << 21);
                   2686: 
                   2687:        carry8 = s8 >> 21;
                   2688:        s9 += carry8;
                   2689:        s8 -= carry8 * ((uint64_t) 1L << 21);
                   2690: 
                   2691:        carry9 = s9 >> 21;
                   2692:        s10 += carry9;
                   2693:        s9 -= carry9 * ((uint64_t) 1L << 21);
                   2694: 
                   2695:        carry10 = s10 >> 21;
                   2696:        s11 += carry10;
                   2697:        s10 -= carry10 * ((uint64_t) 1L << 21);
                   2698: 
                   2699:        s[0]  = s0 >> 0;
                   2700:        s[1]  = s0 >> 8;
                   2701:        s[2]  = (s0 >> 16) | (s1 * ((uint64_t) 1 << 5));
                   2702:        s[3]  = s1 >> 3;
                   2703:        s[4]  = s1 >> 11;
                   2704:        s[5]  = (s1 >> 19) | (s2 * ((uint64_t) 1 << 2));
                   2705:        s[6]  = s2 >> 6;
                   2706:        s[7]  = (s2 >> 14) | (s3 * ((uint64_t) 1 << 7));
                   2707:        s[8]  = s3 >> 1;
                   2708:        s[9]  = s3 >> 9;
                   2709:        s[10] = (s3 >> 17) | (s4 * ((uint64_t) 1 << 4));
                   2710:        s[11] = s4 >> 4;
                   2711:        s[12] = s4 >> 12;
                   2712:        s[13] = (s4 >> 20) | (s5 * ((uint64_t) 1 << 1));
                   2713:        s[14] = s5 >> 7;
                   2714:        s[15] = (s5 >> 15) | (s6 * ((uint64_t) 1 << 6));
                   2715:        s[16] = s6 >> 2;
                   2716:        s[17] = s6 >> 10;
                   2717:        s[18] = (s6 >> 18) | (s7 * ((uint64_t) 1 << 3));
                   2718:        s[19] = s7 >> 5;
                   2719:        s[20] = s7 >> 13;
                   2720:        s[21] = s8 >> 0;
                   2721:        s[22] = s8 >> 8;
                   2722:        s[23] = (s8 >> 16) | (s9 * ((uint64_t) 1 << 5));
                   2723:        s[24] = s9 >> 3;
                   2724:        s[25] = s9 >> 11;
                   2725:        s[26] = (s9 >> 19) | (s10 * ((uint64_t) 1 << 2));
                   2726:        s[27] = s10 >> 6;
                   2727:        s[28] = (s10 >> 14) | (s11 * ((uint64_t) 1 << 7));
                   2728:        s[29] = s11 >> 1;
                   2729:        s[30] = s11 >> 9;
                   2730:        s[31] = s11 >> 17;
                   2731: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>