| Score | Lang | Posted | Author |
|---|---|---|---|
| 001 | C++ gcc | 230806T221955Z | dingledo |
| 039 | Rust | 230811T093824Z | Bubbler |
| 003 | Rust + num-bigint or dashu | 230807T000749Z | Bubbler |
| 003 | Python 3 | 230805T155641Z | l4m2 |
C++ (gcc), ~0.01s
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <ctime>
#include <immintrin.h>
struct C {
double x, y;
C() : x(0), y(0) {}
C(double x, double y) : x(x), y(y) {}
inline C operator+ (const C &c) const { return C(x + c.x, y + c.y); }
inline C operator- (const C &c) const { return C(x - c.x, y - c.y); }
inline C operator* (const C &c) const { return C(x * c.x - y * c.y, x * c.y + y * c.x); }
inline C conj() const { return C(x, -y); }
};
constexpr double PI = acos(-1);
constexpr int SIZE = 100000, B = 32 - __builtin_clz(SIZE * 2 - 1), N = 1 << B; // N = smallest power of two >= 2*SIZE-1
alignas(32) C w[N >> 2], f0[N], f1[N];
void init_roots() {
w[0] = C(1, 0);
for (int len = 1; len < (N >> 2); len <<= 1) {
C wn(cos(PI / (len << 2)), sin(PI / (len << 2)));
for (int i = len; i < len << 1; i++)
w[i] = w[i - len] * wn;
}
}
__m256d avx_complex_mul(__m256d a, __m256d b) {
// two complex multiplies at once: (ax + i*ay) * (bx + i*by) per 128-bit lane
__m256d ax = _mm256_unpacklo_pd(a, a); // real parts duplicated
__m256d ay = _mm256_unpackhi_pd(a, a); // imaginary parts duplicated
__m256d bs = _mm256_shuffle_pd(b, b, 5); // b with re/im swapped
// fmaddsub: even lanes ax*bx - ay*by (real), odd lanes ax*by + ay*bx (imaginary)
return _mm256_fmaddsub_pd(ax, b, _mm256_mul_pd(ay, bs));
}
void fft(C *f) { // forward radix-4 transform: normal-order input, bit-reversed output (DIF)
for (int len = N >> 2; len > 1; len >>= 2) {
for (int i = 0, m = 0; i < N; i += len << 2, m++) {
double w1x = w[m].x, w1y = w[m].y;
double w2x = w1x * w1x - w1y * w1y, w2y = 2 * w1x * w1y;
double w3x = w1x * w2x - w1y * w2y, w3y = w1x * w2y + w1y * w2x;
__m256d w1 = _mm256_setr_pd(w1x, w1y, w1x, w1y);
__m256d w2 = _mm256_setr_pd(w2x, w2y, w2x, w2y);
__m256d w3 = _mm256_setr_pd(w3x, w3y, w3x, w3y);
constexpr __m256d posneg = { 0.0, -0.0, 0.0, -0.0 };
for (int j = i; j < i + len; j += 2) {
__m256d c0 = _mm256_load_pd(&f[j + len * 0].x);
__m256d c1 = avx_complex_mul(_mm256_load_pd(&f[j + len * 1].x), w1);
__m256d c2 = avx_complex_mul(_mm256_load_pd(&f[j + len * 2].x), w2);
__m256d c3 = avx_complex_mul(_mm256_load_pd(&f[j + len * 3].x), w3);
__m256d a02 = _mm256_add_pd(c0, c2);
__m256d a13 = _mm256_add_pd(c1, c3);
__m256d s02 = _mm256_sub_pd(c0, c2);
__m256d s13 = _mm256_xor_pd(posneg, _mm256_sub_pd(c1, c3));
s13 = _mm256_shuffle_pd(s13, s13, 5);
_mm256_store_pd(&f[j + len * 0].x, _mm256_add_pd(a02, a13));
_mm256_store_pd(&f[j + len * 1].x, _mm256_sub_pd(a02, a13));
_mm256_store_pd(&f[j + len * 2].x, _mm256_add_pd(s02, s13));
_mm256_store_pd(&f[j + len * 3].x, _mm256_sub_pd(s02, s13));
}
}
}
for (int i = 0, m = 0; i < N; i += 4, m++) {
C w1 = w[m], w2 = w1 * w1, w3 = w1 * w2;
C c0 = f[i + 0], c1 = f[i + 1] * w1,
c2 = f[i + 2] * w2, c3 = f[i + 3] * w3;
C a02 = c0 + c2, a13 = c1 + c3,
s02 = c0 - c2, s13 = (c1 - c3) * C(0, 1);
f[i + 0] = a02 + a13, f[i + 1] = a02 - a13;
f[i + 2] = s02 + s13, f[i + 3] = s02 - s13;
}
}
void ifft(C *f) { // inverse radix-4 transform: bit-reversed input, normal-order output, unscaled (DIT)
for (int i = 0, m = 0; i < N; i += 4, m++) {
C w1 = w[m], w2 = w1 * w1, w3 = w1 * w2;
C c0 = f[i + 0], c1 = f[i + 1],
c2 = f[i + 2], c3 = f[i + 3];
C a01 = c0 + c1, a23 = c2 + c3,
s01 = c0 - c1, s23 = (c2 - c3) * C(0, 1);
f[i + 0] = a01 + a23, f[i + 1] = (s01 + s23) * w1;
f[i + 2] = (a01 - a23) * w2, f[i + 3] = (s01 - s23) * w3;
}
for (int len = 4; len < N; len <<= 2) {
for (int i = 0, m = 0; i < N; i += len << 2, m++) {
double w1x = w[m].x, w1y = w[m].y;
double w2x = w1x * w1x - w1y * w1y, w2y = 2 * w1x * w1y;
double w3x = w1x * w2x - w1y * w2y, w3y = w1x * w2y + w1y * w2x;
__m256d w1 = _mm256_setr_pd(w1x, w1y, w1x, w1y);
__m256d w2 = _mm256_setr_pd(w2x, w2y, w2x, w2y);
__m256d w3 = _mm256_setr_pd(w3x, w3y, w3x, w3y);
constexpr __m256d posneg = { 0.0, -0.0, 0.0, -0.0 };
for (int j = i; j < i + len; j += 2) {
__m256d c0 = _mm256_load_pd(&f[j + len * 0].x);
__m256d c1 = _mm256_load_pd(&f[j + len * 1].x);
__m256d c2 = _mm256_load_pd(&f[j + len * 2].x);
__m256d c3 = _mm256_load_pd(&f[j + len * 3].x);
__m256d a01 = _mm256_add_pd(c0, c1);
__m256d a23 = _mm256_add_pd(c2, c3);
__m256d s01 = _mm256_sub_pd(c0, c1);
__m256d s23 = _mm256_xor_pd(posneg, _mm256_sub_pd(c2, c3));
s23 = _mm256_shuffle_pd(s23, s23, 5);
_mm256_store_pd(&f[j + len * 0].x, _mm256_add_pd(a01, a23));
_mm256_store_pd(&f[j + len * 1].x, avx_complex_mul(_mm256_add_pd(s01, s23), w1));
_mm256_store_pd(&f[j + len * 2].x, avx_complex_mul(_mm256_sub_pd(a01, a23), w2));
_mm256_store_pd(&f[j + len * 3].x, avx_complex_mul(_mm256_sub_pd(s01, s23), w3));
}
}
}
}
void convolve(uint64_t *a) {
init_roots();
for (int i = 0; i < SIZE; i++)
f0[i] = C(a[i] & 4095, a[i] >> 12); // low 12 bits -> real part, high 12 bits -> imaginary part
fft(f0);
// frequencies 0 and N/2 (indices 0 and 1 in bit-reversed order) are their own conjugate partners
double t0 = 4 * f0[0].x * f0[0].y, t1 = 4 * f0[1].x * f0[1].y;
f1[0].x = 4 * f0[0].y * f0[0].y, f1[0].y = 0;
f1[1].x = 4 * f0[1].y * f0[1].y, f1[1].y = 0;
f0[0].x = 4 * f0[0].x * f0[0].x, f0[0].y = t0;
f0[1].x = 4 * f0[1].x * f0[1].x, f0[1].y = t1;
for (int i = 2, msk = 0; i < N; i += 2) {
msk |= i >> 1;
int j = i ^ msk; // position of the mirrored (negated) frequency in bit-reversed order
C c0 = f0[i] + f0[j].conj(); // c0 = 2 * FFT(lo) at this frequency
C c1 = (f0[i] - f0[j].conj()) * C(0, -1); // c1 = 2 * FFT(hi) at this frequency
C c00 = c0 * c0, c01 = c0 * c1, c11 = c1 * c1;
f0[i] = c00.conj() + c01.conj() * C(0, 1), f0[j] = c00 + c01 * C(0, 1);
f1[i] = c11.conj(), f1[j] = c11;
}
ifft(f0); ifft(f1);
for (int i = 0; i < SIZE * 2 - 1; i++) {
// scale by 1/(4N) and round by adding 0.5 (faster than llround, see update below)
uint64_t c0 = (uint64_t) (f0[i].x / (N << 2) + 0.5);
uint64_t c1 = (uint64_t) (f0[i].y / (N << 2) + 0.5);
uint64_t c2 = (uint64_t) (f1[i].x / (N << 2) + 0.5);
a[i] = c0 + (c1 << 13) + (c2 << 24); // recombine the three chunk products
}
}
uint64_t A[SIZE * 2 - 1];
int main() {
for (int i = 1; i < SIZE; i++) A[i] = 1e7;
clock_t start = clock();
convolve(A);
clock_t end = clock();
uint64_t checksum = 0;
for (int i = 0; i < SIZE * 2 - 1; i++) checksum ^= A[i];
printf("checksum: %llu\n", checksum);
printf("cpu time: %.3fs\n", (double) (end - start) / CLOCKS_PER_SEC);
return 0;
}
Compile with `-O3` or `-Ofast` for best performance, e.g. `g++ -O3 -march=native main.cpp`. `-march=native` (or another flag enabling AVX2 and FMA) may also be necessary for the intrinsics to compile.
Uses a standard floating-point radix-4 FFT. Each input integer is split into two 12-bit chunks to avoid precision loss. Since the input consists of real numbers only, we can get away with a single forward FFT by using `a & 4095` as the real part and `a >> 12` as the imaginary part. Converting back still needs two inverse FFTs, as the output space is twice as large.
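As a concrete model of the packing, here is a minimal numpy sketch of the same trick (`square_split12` is a name invented for this example, and it spends three inverse FFTs where the code above packs the three products into two):

import numpy as np

def square_split12(a):
    a = np.asarray(a, dtype=np.uint64)
    n = len(a)
    size = 1 << (2 * n - 1).bit_length()  # same N as above
    buf = np.zeros(size, dtype=complex)
    buf[:n] = (a & 0xFFF) + 1j * (a >> 12)  # low 12 bits real, high 12 bits imaginary
    F = np.fft.fft(buf)
    Fr = np.conj(np.roll(F[::-1], 1))  # Fr[k] = conj(F[-k])
    L, H = (F + Fr) / 2, (F - Fr) / 2j  # spectra of the two real chunk sequences
    c_ll = np.rint(np.fft.ifft(L * L).real)
    c_lh = np.rint(np.fft.ifft(L * H).real)
    c_hh = np.rint(np.fft.ifft(H * H).real)
    # (lo + hi*2^12)^2 = lo*lo + 2*lo*hi*2^12 + hi*hi*2^24
    return [int(c_ll[i]) + (int(c_lh[i]) << 13) + (int(c_hh[i]) << 24)
            for i in range(2 * n - 1)]

assert square_split12([1, 2]) == [1, 4, 4]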
Update:
- I realized that the bit reversal was unusually time-consuming, so I got rid of it using a DIF-DIT scheme: DIF takes its input in normal order and outputs in bit-reversed order, while DIT takes bit-reversed input and outputs in normal order, so the two permutations magically cancel out :) (see the sketch after this list).
- I made another mistake in assuming that `llround` was relatively fast. Apparently adding `+0.5` and then truncating is nearly 10x faster than `llround`, which led to a ~30% improvement.
- Added SIMD in the FFT loop. This one didn't help that much, presumably because GCC already auto-vectorizes it.
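To see why the DIF-DIT pairing removes the bit-reversal pass, here is a small radix-2 Python model (function names are mine, not taken from the code above): `dif_fft` leaves its output in bit-reversed order, `dit_ifft` expects bit-reversed input, and the pointwise product in between is order-agnostic, so the permutations cancel.

import numpy as np

def dif_fft(a):
    # decimation in frequency: normal-order input, bit-reversed output, no reordering pass
    a = np.array(a, dtype=complex)
    n, half = len(a), len(a) // 2
    while half:
        w = np.exp(-2j * np.pi * np.arange(half) / (2 * half))
        for i in range(0, n, 2 * half):
            u, v = a[i:i + half].copy(), a[i + half:i + 2 * half]
            a[i:i + half] = u + v
            a[i + half:i + 2 * half] = (u - v) * w  # twiddle after the butterfly
        half //= 2
    return a

def dit_ifft(a):
    # decimation in time: bit-reversed input, normal-order output
    a = np.array(a, dtype=complex)
    n, half = len(a), 1
    while half < n:
        w = np.exp(2j * np.pi * np.arange(half) / (2 * half))
        for i in range(0, n, 2 * half):
            u, v = a[i:i + half].copy(), a[i + half:i + 2 * half] * w  # twiddle before
            a[i:i + half] = u + v
            a[i + half:i + 2 * half] = u - v
        half *= 2
    return a / n

# cyclic self-convolution of [1, 2] zero-padded to length 4
f = dif_fft([1, 2, 0, 0])
print(np.round(dit_ifft(f * f).real))  # -> [1. 4. 4. 0.]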
Rust, ~0.048s locally; Rust + rayon, ~0.039s locally
The main code (sequential, independent of rayon) is as follows:
use std::time::Instant;
pub const P: u64 = 10232178353385766913; // = 71 * 2^57 + 1
pub fn mulmod(a: u64, b: u64) -> u64 {
let x = a as u128 * b as u128;
// estimate floor(x / P) as floor((x >> 57) / 71), since P = 71 * 2^57 + 1
let y = x >> 57;
let y1 = (y >> 35) as u64;
let y2 = y as u64 & ((1u64 << 35) - 1);
// 2^35 = 71 * 483939977 + 1, so floor(y / 71) = y1 * 483939977 + (y1 + y2) / 71 exactly
let y_div_71 = y1 * 483939977 + (y1 + y2) / 71;
let sub = y_div_71 as u128 * P as u128;
// the estimate may exceed the true quotient by one; the borrow branch corrects it
let (res, borrow) = x.overflowing_sub(sub);
if borrow { (res as u64).wrapping_add(P) } else { res as u64 }
}
fn powmod(a: u64, p: u64) -> u64 {
let mut cur = 1;
let mut pow = a;
let mut p = p;
while p > 0 {
if p % 2 > 0 {
cur = mulmod(cur, pow);
}
p /= 2;
pow = mulmod(pow, pow);
}
cur
}
fn ntt<const INV: bool>(input: &mut [u64], omega_table: &[u64], inv_p2: u64) {
// length is a power of 2
let len = input.len();
let l = len.trailing_zeros() as usize;
for i in 1..len {
let j = i.reverse_bits() >> (64 - l);
if i < j { input.swap(i, j); }
}
let mut root_pow = len / 2;
for intvl_shift in 1..=l {
let intvl = 1usize << intvl_shift;
input.chunks_exact_mut(intvl).for_each(|chunk| {
let mut root_idx = 0;
let (left, right) = chunk.split_at_mut(intvl / 2);
for (u, v) in left.into_iter().zip(right) {
let u2 = *u;
let v2 = mulmod(*v, omega_table[root_idx]);
let (mut x, overflow) = u2.overflowing_sub(P - v2); // x = u + v mod P, written to avoid u64 overflow
if overflow { x = x.wrapping_add(P); }
*u = x;
let (mut y, overflow) = u2.overflowing_sub(v2); // y = u - v mod P
if overflow { y = y.wrapping_add(P); }
*v = y;
root_idx = root_idx + root_pow; // & len - 1;
}
});
root_pow /= 2;
}
if INV {
for x in input.into_iter() { *x = mulmod(*x, inv_p2); }
}
}
fn ntt_precompute(len: usize) -> (Vec<u64>, Vec<u64>) {
let l = len.trailing_zeros();
let primitive_root = 3;
let omega = powmod(primitive_root, P >> l);
let mut omega_table = vec![1];
// let mut omega_table_prime = vec![((1u128 << 64) / P as u128) as u64];
for i in 1..len {
// omega_table.push((omega_table[i-1] as u128 * omega as u128 % P as u128) as u64);
omega_table.push(mulmod(omega_table[i-1], omega));
// omega_table_prime.push((((omega_table[i] as u128) << 64) / P as u128) as u64);
}
let mut inv_p2 = 1;
let mut inv_p2_table = vec![1];
for _ in 0..l {
// halve modulo P: for odd x, x * inv(2) mod P = (x + P) / 2, computed without overflow
if inv_p2 % 2 == 0 { inv_p2 /= 2; }
else { inv_p2 = inv_p2 / 2 + P / 2 + 1; }
inv_p2_table.push(inv_p2);
}
(omega_table, inv_p2_table)
}
pub fn solve_ntt(a: &[u64]) -> (Vec<u64>, u128) {
let instant = Instant::now();
let len_max = a.len().next_power_of_two() * 2;
let (mut omega_table, inv_p2_table) = ntt_precompute(len_max);
let inv_p2 = inv_p2_table[len_max.trailing_zeros() as usize];
let mut a2 = a.to_vec();
a2.resize(len_max, 0);
ntt::<false>(&mut a2, &omega_table, inv_p2);
for ax in a2.iter_mut() {
*ax = mulmod(*ax, *ax);
}
omega_table[1..].reverse(); // omega^k becomes omega^(len-k) = omega^(-k): the roots for the inverse transform
ntt::<true>(&mut a2, &omega_table, inv_p2);
a2.truncate((a.len() * 2).saturating_sub(1));
let elapsed = instant.elapsed().as_micros();
(a2, elapsed)
}
This is the version that takes ~0.048s. The code uses an NTT (Number Theoretic Transform) modulo P = 10232178353385766913 = 71 * 2^57 + 1. `mulmod` was the major bottleneck until I changed it from u128 multiplication/division to the specialized algorithm for this specific P.
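For reference, a toy pure-Python model of the same squaring-by-NTT with this P and primitive root 3 (recursive and unoptimized; Python's big integers stand in for `mulmod`):

P = 71 * 2**57 + 1

def ntt(a, root):
    n = len(a)
    if n == 1:
        return a
    even = ntt(a[::2], root * root % P)
    odd = ntt(a[1::2], root * root % P)
    w, out = 1, [0] * n
    for i in range(n // 2):
        t = w * odd[i] % P
        out[i], out[i + n // 2] = (even[i] + t) % P, (even[i] - t) % P
        w = w * root % P
    return out

def square(a):
    n = 1 << (2 * len(a) - 1).bit_length()
    root = pow(3, (P - 1) // n, P)  # primitive n-th root of unity, since P - 1 = 71 * 2^57
    f = ntt(a + [0] * (n - len(a)), root)
    f = [x * x % P for x in f]  # pointwise squaring in the transform domain
    g = ntt(f, pow(root, -1, P))  # inverse transform = NTT with root^-1, scaled by n^-1
    inv_n = pow(n, -1, P)
    return [x * inv_n % P for x in g][:2 * len(a) - 1]

assert square([1, 2, 3]) == [1, 4, 10, 12, 9]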
I also made three variations using rayon to parallelize parts of the code where possible, with different heuristics. These versions don't perform well on my local machine (actually a cloud server with only 2 vCPUs), except for the first, but I believe they would all outperform the original on better CPUs.
The entire code can be found at https://github.com/Bubbler-4/cg263748. The functions are `solve_ntt`, `solve_ntt_par`, `solve_ntt_par2`, and `solve_ntt_par3` in `lib.rs`.
Run instructions:
- Clone the repo
- `cargo run --release` to see timings and check that the outputs match
- `cargo bench` to run the benchmark using the same input (the rayon-powered versions show very high variance on my machine)
Rust + num-bigint or dashu, ~0.3 seconds locally
use num_bigint::BigUint;
use dashu::integer::UBig;
use std::time::Instant;
fn solve_num(input: &[u64]) -> (Vec<u64>, u128) {
let mut num = Vec::with_capacity(input.len() * 2);
for &x in input {
// two u32 limbs per input value: each value gets its own 64-bit slot, and the
// squared coefficients stay below 2^64, so neighbouring slots never overlap
num.push(x as u32);
num.push(0);
}
let big = BigUint::from_slice(&num);
let instant = Instant::now();
let big2 = &big * &big;
let elapsed = instant.elapsed().as_micros();
(big2.to_u64_digits(), elapsed)
}
fn solve_dashu(input: &[u64]) -> (Vec<u64>, u128) {
let big = UBig::from_words(input);
let instant = Instant::now();
let square = big.square();
let elapsed = instant.elapsed().as_micros();
(square.as_words().to_vec(), elapsed)
}
fn main() {
let input = (
vec![10_000_000u64; 100_000],
vec![1; 5]
).0;
let (result, elapsed) = solve_num(&input);
println!("elapsed: {}.{:06}", elapsed / 1000000, elapsed % 1000000);
eprintln!("{:?}", result);
let (result, elapsed) = solve_dashu(&input);
println!("elapsed: {}.{:06}", elapsed / 1000000, elapsed % 1000000);
eprintln!("{:?}", result);
}
Instructions to run:
- Create a cargo project with `cargo new cg263748`
- Add dependencies in `Cargo.toml`: `[dependencies] num-bigint = "0.4" dashu = "0.3"`
- Replace the content of `src/main.rs` with the source above
- Run with `cargo run --release 2>/dev/null`
Both num-bigint and dashu seem to use a naive/Karatsuba/Toom-3 combination, switching algorithms by input size. For the largest sizes, the main algorithm used is Toom-3. For an input vector of size 10^5, num-bigint takes ~0.31s and dashu takes ~0.27s.
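For intuition, the Karatsuba layer of such a combination looks roughly like this (a sketch on Python ints; `cutoff_bits` is an invented parameter, and the real libraries operate on limb vectors rather than whole integers):

def karatsuba(x, y, cutoff_bits=64):
    # below the cutoff, fall back to the builtin multiply
    if min(x.bit_length(), y.bit_length()) <= cutoff_bits:
        return x * y
    m = max(x.bit_length(), y.bit_length()) // 2
    x1, x0 = x >> m, x & ((1 << m) - 1)
    y1, y0 = y >> m, y & ((1 << m) - 1)
    z2 = karatsuba(x1, y1, cutoff_bits)
    z0 = karatsuba(x0, y0, cutoff_bits)
    # three recursive multiplies instead of four
    z1 = karatsuba(x1 + x0, y1 + y0, cutoff_bits) - z2 - z0
    return (z2 << (2 * m)) + (z1 << m) + z0

x = 3**500
assert karatsuba(x, x) == x * x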
I do have an NTT source code for personal use, but I highly doubt it would be competitive, given that a term can go up to 10^19. That means I'd have to either use 128-bit multiplication and division in the main algorithm, or convolve with two (or maybe three) different moduli and combine the results with the Chinese Remainder Theorem, as sketched below.
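The CRT combination step would look like this (a sketch assuming three ~2^30 NTT-friendly primes; two are not enough here, since 998244353 * 754974721 is about 7.5 * 10^17, below the ~10^19 coefficient bound):

def crt(residues, moduli):
    # incremental Garner-style CRT: after each step, x is the unique solution mod m
    x, m = 0, 1
    for r, p in zip(residues, moduli):
        x += (r - x) * pow(m, -1, p) % p * m
        m *= p
    return x  # 0 <= x < product of the moduli

primes = [998244353, 754974721, 167772161]  # product is ~1.3 * 10^26
val = 10**19 - 1
assert crt([val % p for p in primes], primes) == val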
EDIT: Apparently it is possible to do a single NTT with a prime very close to 2^64, using one of the algorithms described in this paper. For example, one could choose p = 10485760000033554433, where 5 is a primitive root. Unfortunately, the faster variants won't work, because they require the prime to be under 2^63.
EDIT 2: I'm working on the NTT implementation from the paper. I realized I need to do elementwise `a * b % P` anyway, so I came up with a "u128 division"-less algorithm (godbolt) for the specific P = 71 * 2^57 + 1 = 10232178353385766913. It is three times as fast as the naive `(a as u128 * b as u128 % P as u128) as u64`:
slow mulmod time: [21.182 ns 21.185 ns 21.188 ns]
fast mulmod time: [7.0764 ns 7.1222 ns 7.1864 ns]
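A quick pure-Python check of the same reduction (the magic constant 483939977 is floor(2^35 / 71); since 71 * 483939977 = 2^35 - 1, the quotient step is exact):

import random

P = 71 * 2**57 + 1
M = 2**35 // 71  # 483939977

def mulmod(a, b):
    x = a * b
    y = x >> 57  # estimate floor(x / P) as floor(y / 71), since P = 71 * 2^57 + 1
    y1, y2 = y >> 35, y & (2**35 - 1)
    q = y1 * M + (y1 + y2) // 71  # exact floor(y / 71)
    r = x - q * P  # q can overshoot the true quotient by one, making r negative
    return r + P if r < 0 else r

for _ in range(100000):
    a, b = random.randrange(P), random.randrange(P)
    assert mulmod(a, b) == a * b % P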
Python 3, 3 seconds
R = "".join("%028o"%x for x in A)
B = int(R,8)**2 | (1 << 599997*28)
t = oct(B)
G = [int(t[3+i*28:31+i*28],8) for i in range(199999)]
This was a golfing solution from somewhere else, but it's also mine.
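The trick is Kronecker substitution: pack the coefficients into one huge integer with enough headroom per slot (28 octal digits = 84 bits, comfortably above the ~10^19 maximum output coefficient), square once, and slice the slots back apart. The same idea as a reusable sketch (`kronecker_square` is a name invented for this example):

def kronecker_square(a, bits=84):
    # pack little-endian: coefficient i occupies bits [bits*i, bits*(i+1))
    x = sum(v << (bits * i) for i, v in enumerate(a))
    y = x * x  # one big-integer squaring performs the whole convolution
    mask = (1 << bits) - 1
    return [(y >> (bits * i)) & mask for i in range(2 * len(a) - 1)]

assert kronecker_square([1, 2, 3]) == [1, 4, 10, 12, 9]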