| Score | Lang | Posted | Author |
|---|---|---|---|
| 001 | C++ gcc | 230806T221955Z | dingledo |
| 039 | Rust | 230811T093824Z | Bubbler |
| 003 | Rust + num-bigint or dashu | 230807T000749Z | Bubbler |
| 003 | Python 3 | 230805T155641Z | l4m2 |
C++ (gcc), ~0.01s
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <immintrin.h>
struct C {
double x, y;
C() : x(0), y(0) {}
C(double x, double y) : x(x), y(y) {}
inline C operator+ (const C &c) const { return C(x + c.x, y + c.y); }
inline C operator- (const C &c) const { return C(x - c.x, y - c.y); }
inline C operator* (const C &c) const { return C(x * c.x - y * c.y, x * c.y + y * c.x); }
inline C conj() const { return C(x, -y); }
};
constexpr double PI = acos(-1);
constexpr int SIZE = 100000, B = 32 - __builtin_clz(SIZE * 2 - 1), N = 1 << B; // N = smallest power of two >= 2*SIZE-1
alignas(32) C w[N >> 2], f0[N], f1[N];
void init_roots() {
w[0] = C(1, 0);
for (int len = 1; len < (N >> 2); len <<= 1) {
C wn(cos(PI / (len << 2)), sin(PI / (len << 2)));
for (int i = len; i < len << 1; i++)
w[i] = w[i - len] * wn;
}
}
__m256d avx_complex_mul(__m256d a, __m256d b) {
// two complex multiplies at once: (ax + i*ay) * (bx + i*by) per 128-bit lane
__m256d ax = _mm256_unpacklo_pd(a, a); // real parts duplicated
__m256d ay = _mm256_unpackhi_pd(a, a); // imaginary parts duplicated
__m256d bs = _mm256_shuffle_pd(b, b, 5); // b with re/im swapped
// fmaddsub: even lanes ax*bx - ay*by (real), odd lanes ax*by + ay*bx (imaginary)
return _mm256_fmaddsub_pd(ax, b, _mm256_mul_pd(ay, bs));
}
void fft(C *f) { // forward radix-4 transform: normal-order input, bit-reversed output (DIF)
for (int len = N >> 2; len > 1; len >>= 2) {
for (int i = 0, m = 0; i < N; i += len << 2, m++) {
double w1x = w[m].x, w1y = w[m].y;
double w2x = w1x * w1x - w1y * w1y, w2y = 2 * w1x * w1y;
double w3x = w1x * w2x - w1y * w2y, w3y = w1x * w2y + w1y * w2x;
__m256d w1 = _mm256_setr_pd(w1x, w1y, w1x, w1y);
__m256d w2 = _mm256_setr_pd(w2x, w2y, w2x, w2y);
__m256d w3 = _mm256_setr_pd(w3x, w3y, w3x, w3y);
constexpr __m256d posneg = { 0.0, -0.0, 0.0, -0.0 };
for (int j = i; j < i + len; j += 2) {
__m256d c0 = _mm256_load_pd(&f[j + len * 0].x);
__m256d c1 = avx_complex_mul(_mm256_load_pd(&f[j + len * 1].x), w1);
__m256d c2 = avx_complex_mul(_mm256_load_pd(&f[j + len * 2].x), w2);
__m256d c3 = avx_complex_mul(_mm256_load_pd(&f[j + len * 3].x), w3);
__m256d a02 = _mm256_add_pd(c0, c2);
__m256d a13 = _mm256_add_pd(c1, c3);
__m256d s02 = _mm256_sub_pd(c0, c2);
__m256d s13 = _mm256_xor_pd(posneg, _mm256_sub_pd(c1, c3));
s13 = _mm256_shuffle_pd(s13, s13, 5);
_mm256_store_pd(&f[j + len * 0].x, _mm256_add_pd(a02, a13));
_mm256_store_pd(&f[j + len * 1].x, _mm256_sub_pd(a02, a13));
_mm256_store_pd(&f[j + len * 2].x, _mm256_add_pd(s02, s13));
_mm256_store_pd(&f[j + len * 3].x, _mm256_sub_pd(s02, s13));
}
}
}
for (int i = 0, m = 0; i < N; i += 4, m++) {
C w1 = w[m], w2 = w1 * w1, w3 = w1 * w2;
C c0 = f[i + 0], c1 = f[i + 1] * w1,
c2 = f[i + 2] * w2, c3 = f[i + 3] * w3;
C a02 = c0 + c2, a13 = c1 + c3,
s02 = c0 - c2, s13 = (c1 - c3) * C(0, 1);
f[i + 0] = a02 + a13, f[i + 1] = a02 - a13;
f[i + 2] = s02 + s13, f[i + 3] = s02 - s13;
}
}
void ifft(C *f) { // inverse radix-4 transform: bit-reversed input, normal-order output, unscaled (DIT)
for (int i = 0, m = 0; i < N; i += 4, m++) {
C w1 = w[m], w2 = w1 * w1, w3 = w1 * w2;
C c0 = f[i + 0], c1 = f[i + 1],
c2 = f[i + 2], c3 = f[i + 3];
C a01 = c0 + c1, a23 = c2 + c3,
s01 = c0 - c1, s23 = (c2 - c3) * C(0, 1);
f[i + 0] = a01 + a23, f[i + 1] = (s01 + s23) * w1;
f[i + 2] = (a01 - a23) * w2, f[i + 3] = (s01 - s23) * w3;
}
for (int len = 4; len < N; len <<= 2) {
for (int i = 0, m = 0; i < N; i += len << 2, m++) {
double w1x = w[m].x, w1y = w[m].y;
double w2x = w1x * w1x - w1y * w1y, w2y = 2 * w1x * w1y;
double w3x = w1x * w2x - w1y * w2y, w3y = w1x * w2y + w1y * w2x;
__m256d w1 = _mm256_setr_pd(w1x, w1y, w1x, w1y);
__m256d w2 = _mm256_setr_pd(w2x, w2y, w2x, w2y);
__m256d w3 = _mm256_setr_pd(w3x, w3y, w3x, w3y);
constexpr __m256d posneg = { 0.0, -0.0, 0.0, -0.0 };
for (int j = i; j < i + len; j += 2) {
__m256d c0 = _mm256_load_pd(&f[j + len * 0].x);
__m256d c1 = _mm256_load_pd(&f[j + len * 1].x);
__m256d c2 = _mm256_load_pd(&f[j + len * 2].x);
__m256d c3 = _mm256_load_pd(&f[j + len * 3].x);
__m256d a01 = _mm256_add_pd(c0, c1);
__m256d a23 = _mm256_add_pd(c2, c3);
__m256d s01 = _mm256_sub_pd(c0, c1);
__m256d s23 = _mm256_xor_pd(posneg, _mm256_sub_pd(c2, c3));
s23 = _mm256_shuffle_pd(s23, s23, 5);
_mm256_store_pd(&f[j + len * 0].x, _mm256_add_pd(a01, a23));
_mm256_store_pd(&f[j + len * 1].x, avx_complex_mul(_mm256_add_pd(s01, s23), w1));
_mm256_store_pd(&f[j + len * 2].x, avx_complex_mul(_mm256_sub_pd(a01, a23), w2));
_mm256_store_pd(&f[j + len * 3].x, avx_complex_mul(_mm256_sub_pd(s01, s23), w3));
}
}
}
}
void convolve(uint64_t *a) {
init_roots();
for (int i = 0; i < SIZE; i++)
f0[i] = C(a[i] & 4095, a[i] >> 12); // low 12 bits -> real part, high 12 bits -> imaginary part
fft(f0);
// frequencies 0 and N/2 (indices 0 and 1 in bit-reversed order) are their own conjugate partners
double t0 = 4 * f0[0].x * f0[0].y, t1 = 4 * f0[1].x * f0[1].y;
f1[0].x = 4 * f0[0].y * f0[0].y, f1[0].y = 0;
f1[1].x = 4 * f0[1].y * f0[1].y, f1[1].y = 0;
f0[0].x = 4 * f0[0].x * f0[0].x, f0[0].y = t0;
f0[1].x = 4 * f0[1].x * f0[1].x, f0[1].y = t1;
for (int i = 2, msk = 0; i < N; i += 2) {
msk |= i >> 1;
int j = i ^ msk; // position of the mirrored (negated) frequency in bit-reversed order
C c0 = f0[i] + f0[j].conj(); // c0 = 2 * FFT(lo) at this frequency
C c1 = (f0[i] - f0[j].conj()) * C(0, -1); // c1 = 2 * FFT(hi) at this frequency
C c00 = c0 * c0, c01 = c0 * c1, c11 = c1 * c1;
f0[i] = c00.conj() + c01.conj() * C(0, 1), f0[j] = c00 + c01 * C(0, 1);
f1[i] = c11.conj(), f1[j] = c11;
}
ifft(f0); ifft(f1);
for (int i = 0; i < SIZE * 2 - 1; i++) {
// scale by 1/(4N) and round by adding 0.5 (faster than llround, see update below)
uint64_t c0 = (uint64_t) (f0[i].x / (N << 2) + 0.5);
uint64_t c1 = (uint64_t) (f0[i].y / (N << 2) + 0.5);
uint64_t c2 = (uint64_t) (f1[i].x / (N << 2) + 0.5);
a[i] = c0 + (c1 << 13) + (c2 << 24); // recombine the three chunk products
}
}
uint64_t A[SIZE * 2 - 1];
int main() {
for (int i = 1; i < SIZE; i++) A[i] = 1e7;
clock_t start = clock();
convolve(A);
clock_t end = clock();
uint64_t checksum = 0;
for (int i = 0; i < SIZE * 2 - 1; i++) checksum ^= A[i];
printf("checksum: %llu\n", checksum);
printf("cpu time: %.3fs\n", (double) (end - start) / CLOCKS_PER_SEC);
return 0;
}
Compile with `-O3` or `-Ofast` for best performance, e.g. `g++ -O3 -march=native main.cpp`. `-march=native` (or another flag enabling AVX2 and FMA) may also be necessary for the intrinsics to compile.
Uses a standard floating-point radix-4 FFT. Each input integer is split into two 12-bit chunks to avoid precision loss. Since the input consists of real numbers only, we can get away with a single forward FFT by using `a & 4095` as the real part and `a >> 12` as the imaginary part. Converting back still needs two inverse FFTs, as the output space is twice as large.
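As a concrete model of the packing, here is a minimal numpy sketch of the same trick (`square_split12` is a name invented for this example, and it spends three inverse FFTs where the code above packs the three products into two):

import numpy as np

def square_split12(a):
    a = np.asarray(a, dtype=np.uint64)
    n = len(a)
    size = 1 << (2 * n - 1).bit_length()  # same N as above
    buf = np.zeros(size, dtype=complex)
    buf[:n] = (a & 0xFFF) + 1j * (a >> 12)  # low 12 bits real, high 12 bits imaginary
    F = np.fft.fft(buf)
    Fr = np.conj(np.roll(F[::-1], 1))  # Fr[k] = conj(F[-k])
    L, H = (F + Fr) / 2, (F - Fr) / 2j  # spectra of the two real chunk sequences
    c_ll = np.rint(np.fft.ifft(L * L).real)
    c_lh = np.rint(np.fft.ifft(L * H).real)
    c_hh = np.rint(np.fft.ifft(H * H).real)
    # (lo + hi*2^12)^2 = lo*lo + 2*lo*hi*2^12 + hi*hi*2^24
    return [int(c_ll[i]) + (int(c_lh[i]) << 13) + (int(c_hh[i]) << 24)
            for i in range(2 * n - 1)]

assert square_split12([1, 2]) == [1, 4, 4]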
Update:
- I realized that the bit reversal was unusually time-consuming, so I got rid of it using a DIF-DIT scheme: DIF takes its input in normal order and outputs in bit-reversed order, while DIT takes bit-reversed input and outputs in normal order, so the two permutations magically cancel out :) (see the sketch after this list).
- I made another mistake in assuming that `llround` was relatively fast. Apparently adding `+0.5` and then truncating is nearly 10x faster than `llround`, which led to a ~30% improvement.
- Added SIMD in the FFT loop. This one didn't help that much, presumably because GCC already auto-vectorizes it.
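To see why the DIF-DIT pairing removes the bit-reversal pass, here is a small radix-2 Python model (function names are mine, not taken from the code above): `dif_fft` leaves its output in bit-reversed order, `dit_ifft` expects bit-reversed input, and the pointwise product in between is order-agnostic, so the permutations cancel.

import numpy as np

def dif_fft(a):
    # decimation in frequency: normal-order input, bit-reversed output, no reordering pass
    a = np.array(a, dtype=complex)
    n, half = len(a), len(a) // 2
    while half:
        w = np.exp(-2j * np.pi * np.arange(half) / (2 * half))
        for i in range(0, n, 2 * half):
            u, v = a[i:i + half].copy(), a[i + half:i + 2 * half]
            a[i:i + half] = u + v
            a[i + half:i + 2 * half] = (u - v) * w  # twiddle after the butterfly
        half //= 2
    return a

def dit_ifft(a):
    # decimation in time: bit-reversed input, normal-order output
    a = np.array(a, dtype=complex)
    n, half = len(a), 1
    while half < n:
        w = np.exp(2j * np.pi * np.arange(half) / (2 * half))
        for i in range(0, n, 2 * half):
            u, v = a[i:i + half].copy(), a[i + half:i + 2 * half] * w  # twiddle before
            a[i:i + half] = u + v
            a[i + half:i + 2 * half] = u - v
        half *= 2
    return a / n

# cyclic self-convolution of [1, 2] zero-padded to length 4
f = dif_fft([1, 2, 0, 0])
print(np.round(dit_ifft(f * f).real))  # -> [1. 4. 4. 0.]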
Rust, ~0.048s locally; Rust + rayon, ~0.039s locally
The main code (sequential, independent of rayon) is as follows:
use std::time::Instant;
pub const P: u64 = 10232178353385766913; // = 71 * 2^57 + 1
pub fn mulmod(a: u64, b: u64) -> u64 {
let x = a as u128 * b as u128;
// estimate floor(x / P) as floor((x >> 57) / 71), since P = 71 * 2^57 + 1
let y = x >> 57;
let y1 = (y >> 35) as u64;
let y2 = y as u64 & ((1u64 << 35) - 1);
// 2^35 = 71 * 483939977 + 1, so floor(y / 71) = y1 * 483939977 + (y1 + y2) / 71 exactly
let y_div_71 = y1 * 483939977 + (y1 + y2) / 71;
let sub = y_div_71 as u128 * P as u128;
// the estimate may exceed the true quotient by one; the borrow branch corrects it
let (res, borrow) = x.overflowing_sub(sub);
if borrow { (res as u64).wrapping_add(P) } else { res as u64 }
}
fn powmod(a: u64, p: u64) -> u64 {
let mut cur = 1;
let mut pow = a;
let mut p = p;
while p > 0 {
if p % 2 > 0 {
cur = mulmod(cur, pow);
}
p /= 2;
pow = mulmod(pow, pow);
}
cur
}
fn ntt<const INV: bool>(input: &mut [u64], omega_table: &[u64], inv_p2: u64) {
// length is a power of 2
let len = input.len();
let l = len.trailing_zeros() as usize;
for i in 1..len {
let j = i.reverse_bits() >> (64 - l);
if i < j { input.swap(i, j); }
}
let mut root_pow = len / 2;
for intvl_shift in 1..=l {
let intvl = 1usize << intvl_shift;
input.chunks_exact_mut(intvl).for_each(|chunk| {
let mut root_idx = 0;
let (left, right) = chunk.split_at_mut(intvl / 2);
for (u, v) in left.into_iter().zip(right) {
let u2 = *u;
let v2 = mulmod(*v, omega_table[root_idx]);
let (mut x, overflow) = u2.overflowing_sub(P - v2); // x = u + v mod P, written to avoid u64 overflow
if overflow { x = x.wrapping_add(P); }
*u = x;
let (mut y, overflow) = u2.overflowing_sub(v2); // y = u - v mod P
if overflow { y = y.wrapping_add(P); }
*v = y;
root_idx = root_idx + root_pow; // & len - 1;
}
});
root_pow /= 2;
}
if INV {
for x in input.into_iter() { *x = mulmod(*x, inv_p2); }
}
}
fn ntt_precompute(len: usize) -> (Vec<u64>, Vec<u64>) {
let l = len.trailing_zeros();
let primitive_root = 3;
let omega = powmod(primitive_root, P >> l);
let mut omega_table = vec![1];
// let mut omega_table_prime = vec![((1u128 << 64) / P as u128) as u64];
for i in 1..len {
// omega_table.push((omega_table[i-1] as u128 * omega as u128 % P as u128) as u64);
omega_table.push(mulmod(omega_table[i-1], omega));
// omega_table_prime.push((((omega_table[i] as u128) << 64) / P as u128) as u64);
}
let mut inv_p2 = 1;
let mut inv_p2_table = vec![1];
for _ in 0..l {
// halve modulo P: for odd x, x * inv(2) mod P = (x + P) / 2, computed without overflow
if inv_p2 % 2 == 0 { inv_p2 /= 2; }
else { inv_p2 = inv_p2 / 2 + P / 2 + 1; }
inv_p2_table.push(inv_p2);
}
(omega_table, inv_p2_table)
}
pub fn solve_ntt(a: &[u64]) -> (Vec<u64>, u128) {
let instant = Instant::now();
let len_max = a.len().next_power_of_two() * 2;
let (mut omega_table, inv_p2_table) = ntt_precompute(len_max);
let inv_p2 = inv_p2_table[len_max.trailing_zeros() as usize];
let mut a2 = a.to_vec();
a2.resize(len_max, 0);
ntt::<false>(&mut a2, &omega_table, inv_p2);
for ax in a2.iter_mut() {
*ax = mulmod(*ax, *ax);
}
omega_table[1..].reverse(); // omega^k becomes omega^(len-k) = omega^(-k): the roots for the inverse transform
ntt::<true>(&mut a2, &omega_table, inv_p2);
a2.truncate((a.len() * 2).saturating_sub(1));
let elapsed = instant.elapsed().as_micros();
(a2, elapsed)
}
This is the version that takes ~0.048s. The code uses an NTT (Number Theoretic Transform) modulo P = 10232178353385766913 = 71 * 2^57 + 1. `mulmod` was the major bottleneck until I changed it from u128 multiplication/division to the specialized algorithm for this specific P.
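For reference, a toy pure-Python model of the same squaring-by-NTT with this P and primitive root 3 (recursive and unoptimized; Python's big integers stand in for `mulmod`):

P = 71 * 2**57 + 1

def ntt(a, root):
    n = len(a)
    if n == 1:
        return a
    even = ntt(a[::2], root * root % P)
    odd = ntt(a[1::2], root * root % P)
    w, out = 1, [0] * n
    for i in range(n // 2):
        t = w * odd[i] % P
        out[i], out[i + n // 2] = (even[i] + t) % P, (even[i] - t) % P
        w = w * root % P
    return out

def square(a):
    n = 1 << (2 * len(a) - 1).bit_length()
    root = pow(3, (P - 1) // n, P)  # primitive n-th root of unity, since P - 1 = 71 * 2^57
    f = ntt(a + [0] * (n - len(a)), root)
    f = [x * x % P for x in f]  # pointwise squaring in the transform domain
    g = ntt(f, pow(root, -1, P))  # inverse transform = NTT with root^-1, scaled by n^-1
    inv_n = pow(n, -1, P)
    return [x * inv_n % P for x in g][:2 * len(a) - 1]

assert square([1, 2, 3]) == [1, 4, 10, 12, 9]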
I also made three variations using rayon to parallelize parts of the code where possible, with different heuristics. These versions don't perform well on my local machine (actually a cloud server with only 2 vCPUs), except for the first, but I believe they would all outperform the original on better CPUs.
The entire code can be found at https://github.com/Bubbler-4/cg263748. The functions are `solve_ntt`, `solve_ntt_par`, `solve_ntt_par2`, and `solve_ntt_par3` in `lib.rs`.
Run instructions:
- Clone the repo
- `cargo run --release` to see timings and check that the outputs match
- `cargo bench` to run the benchmark using the same input (the rayon-powered versions show very high variance on my machine)
Rust + num-bigint or dashu, ~0.3 seconds locally
use num_bigint::BigUint;
use dashu::integer::UBig;
use std::time::Instant;
fn solve_num(input: &[u64]) -> (Vec<u64>, u128) {
let mut num = Vec::with_capacity(input.len() * 2);
for &x in input {
// two u32 limbs per input value: each value gets its own 64-bit slot, and the
// squared coefficients stay below 2^64, so neighbouring slots never overlap
num.push(x as u32);
num.push(0);
}
let big = BigUint::from_slice(&num);
let instant = Instant::now();
let big2 = &big * &big;
let elapsed = instant.elapsed().as_micros();
(big2.to_u64_digits(), elapsed)
}
fn solve_dashu(input: &[u64]) -> (Vec<u64>, u128) {
let big = UBig::from_words(input);
let instant = Instant::now();
let square = big.square();
let elapsed = instant.elapsed().as_micros();
(square.as_words().to_vec(), elapsed)
}
fn main() {
let input = (
vec![10_000_000u64; 100_000],
vec![1; 5]
).0;
let (result, elapsed) = solve_num(&input);
println!("elapsed: {}.{:06}", elapsed / 1000000, elapsed % 1000000);
eprintln!("{:?}", result);
let (result, elapsed) = solve_dashu(&input);
println!("elapsed: {}.{:06}", elapsed / 1000000, elapsed % 1000000);
eprintln!("{:?}", result);
}
Instructions to run:
- Create a cargo project with `cargo new cg263748`
- Add dependencies in `Cargo.toml`: `[dependencies] num-bigint = "0.4" dashu = "0.3"`
- Replace the content of `src/main.rs` with the source above
- Run with `cargo run --release 2>/dev/null`
Both num-bigint and dashu seem to use a naive/Karatsuba/Toom-3 combination, switching algorithms by input size. For the largest sizes, the main algorithm used is Toom-3. For an input vector of size 10^5, num-bigint takes ~0.31s and dashu takes ~0.27s.
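For intuition, the Karatsuba layer of such a combination looks roughly like this (a sketch on Python ints; `cutoff_bits` is an invented parameter, and the real libraries operate on limb vectors rather than whole integers):

def karatsuba(x, y, cutoff_bits=64):
    # below the cutoff, fall back to the builtin multiply
    if min(x.bit_length(), y.bit_length()) <= cutoff_bits:
        return x * y
    m = max(x.bit_length(), y.bit_length()) // 2
    x1, x0 = x >> m, x & ((1 << m) - 1)
    y1, y0 = y >> m, y & ((1 << m) - 1)
    z2 = karatsuba(x1, y1, cutoff_bits)
    z0 = karatsuba(x0, y0, cutoff_bits)
    # three recursive multiplies instead of four
    z1 = karatsuba(x1 + x0, y1 + y0, cutoff_bits) - z2 - z0
    return (z2 << (2 * m)) + (z1 << m) + z0

x = 3**500
assert karatsuba(x, x) == x * x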
I do have an NTT source code for personal use, but I highly doubt it would be competitive, given that a term can go up to 10^19. That means I'd have to either use 128-bit multiplication and division in the main algorithm, or convolve with two (or maybe three) different moduli and combine the results with the Chinese Remainder Theorem, as sketched below.
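The CRT combination step would look like this (a sketch assuming three ~2^30 NTT-friendly primes; two are not enough here, since 998244353 * 754974721 is about 7.5 * 10^17, below the ~10^19 coefficient bound):

def crt(residues, moduli):
    # incremental Garner-style CRT: after each step, x is the unique solution mod m
    x, m = 0, 1
    for r, p in zip(residues, moduli):
        x += (r - x) * pow(m, -1, p) % p * m
        m *= p
    return x  # 0 <= x < product of the moduli

primes = [998244353, 754974721, 167772161]  # product is ~1.3 * 10^26
val = 10**19 - 1
assert crt([val % p for p in primes], primes) == val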
EDIT: Apparently it is possible to do a single NTT with a prime very close to 2^64, using one of the algorithms described in this paper. For example, one could choose p = 10485760000033554433, where 5 is a primitive root. Unfortunately, the faster variants won't work, because they require the prime to be under 2^63.
EDIT 2: I'm working on the NTT implementation from the paper. I realized I need to do elementwise `a * b % P` anyway, so I came up with a "u128 division"-less algorithm (godbolt) for the specific P = 71 * 2^57 + 1 = 10232178353385766913. It is three times as fast as the naive `(a as u128 * b as u128 % P as u128) as u64`:
slow mulmod time: [21.182 ns 21.185 ns 21.188 ns]
fast mulmod time: [7.0764 ns 7.1222 ns 7.1864 ns]
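A quick pure-Python check of the same reduction (the magic constant 483939977 is floor(2^35 / 71); since 71 * 483939977 = 2^35 - 1, the quotient step is exact):

import random

P = 71 * 2**57 + 1
M = 2**35 // 71  # 483939977

def mulmod(a, b):
    x = a * b
    y = x >> 57  # estimate floor(x / P) as floor(y / 71), since P = 71 * 2^57 + 1
    y1, y2 = y >> 35, y & (2**35 - 1)
    q = y1 * M + (y1 + y2) // 71  # exact floor(y / 71)
    r = x - q * P  # q can overshoot the true quotient by one, making r negative
    return r + P if r < 0 else r

for _ in range(100000):
    a, b = random.randrange(P), random.randrange(P)
    assert mulmod(a, b) == a * b % P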
Python 3, 3 seconds
R = "".join("%028o"%x for x in A)
B = int(R,8)**2 | (1 << 599997*28)
t = oct(B)
G = [int(t[3+i*28:31+i*28],8) for i in range(199999)]
This was a golfing solution from somewhere else, but it's also mine.
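The trick is Kronecker substitution: pack the coefficients into one huge integer with enough headroom per slot (28 octal digits = 84 bits, comfortably above the ~10^19 maximum output coefficient), square once, and slice the slots back apart. The same idea as a reusable sketch (`kronecker_square` is a name invented for this example):

def kronecker_square(a, bits=84):
    # pack little-endian: coefficient i occupies bits [bits*i, bits*(i+1))
    x = sum(v << (bits * i) for i, v in enumerate(a))
    y = x * x  # one big-integer squaring performs the whole convolution
    mask = (1 << bits) - 1
    return [(y >> (bits * i)) & mask for i in range(2 * len(a) - 1)]

assert kronecker_square([1, 2, 3]) == [1, 4, 10, 12, 9]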