g | x | w | all
Bytes Lang Time Link
348C GCC240615T040436ZEasyasPi
nanPython 3140727T185916ZRay

C (GCC, 32-bit Linux -fno-signed-char -fno-common*), 237 227(353 348 chars)

c,p,o;char*r,*t,d,s,w[];i(){sprintf(w+4,"%i",o++);while(r<t){p=creat(w,4);write(p,w,4);while(r<t){s=0;while(s++<8){d+=d;if(r<t){d+=*r++;c--;}}write(p,&d,1);}}}main(){read(p,w,4);while(read(p,&d,1)>0){r=realloc(r,c+8);s=0;while(s++<8){r[c++]=d>>7;d+=d;}}while(t=memmem(r+48,c-48,"\0\0\1\1\0\0\0\1\0\1\0\0\0\0\0\1\0\1\0\1\1\0\0\1\0\0\1\0\0\1\1\0\0\1\0\1\0\0\1\1\0\1\0\1\1\0\0\1",48))i();t=r+c;i();}

For display, C escapes are used in the string, it actually uses raw 0x00/0x01 bytes which SE hates.

Base64 BZIP2 data (compressed with lbzip2 2.5)

QlpoOTFBWSZTWfB/t1oAAGbbgGAAE35kzwAKL2fciiAA6EEqfqRE2kBoAAAA0IIA0aAAABFFhhkiiKqCKiiMgqRzpMg2PLIz2z0WCPu4OfaSWiprTtRijx6pj5V2HeKRhPI1SO8CmCArVNnU2KBBGoZ7uDiE5rRc6bhAlG0XM1hCwGEkSEVEbhQgheGNdFWvmHAYn2b2WfdpCl/g+CvA1hPHwJwDhyvF4L22UqwToyiBJJW+adTQ4N4CGH43/qViAsFnp5aiwrbg17yvtsynydwtJGA6wGAZfxdyRThQkPB/t1o=

Try it online!

Time to revive an old challenge. It has my favorite number in it, it has one of my favorite algorithm categories in it, I can't resist.

Notes:

Optimizing this for bzip2 was quite interesting since larger code is sometimes better. For example, using read(a,&e,1) is normally silly compared to getchar(), but it compresses slightly better (unless I am missing something) since the function is called in the same way as write.

Similarly, it is actually not silly to put a 48 byte lookup table in the code since it compresses really well, and most strangely, using for to combine statements instead of while actually seems to compress worse, even if it allows eliminating braces and statements. I haven't tested other variable names though.

I tried a few implementations, namely bzip2, pbzip2, 7z, and lbzip2, with lbzip2 seemingly compressing the best.

I also did a brute force search on the variable names for a good compression ratio and got 227 bytes with this one specific combination (not that there isn't guaranteed to be a shorter one, though, I'm not running a truly exhaustive search because that will take forever)

Ungolfed commented code:

#define _GNU_SOURCE // ensure memmem
#include <fcntl.h>  // creat, close
#include <unistd.h> // read, write
#include <stdio.h>  // sprintf
#include <stdlib.h> // realloc
#include <string.h> // memmem

#ifdef REALLOC_HACK
// Hack to switch incremental realloc to power-of-2 realloc since the
// realloc loop can be painfully slow. With the Scudo allocator, this
// little maneuver will save you MINUTES on d2.in.
void *realloc_hack(void *buf, size_t len) {
    static size_t real_size = 16;
    if (buf == NULL || len > real_size) {
        while (len > real_size) {
            real_size *= 2;
        }
        return realloc(buf, real_size);
    }
    return buf;
}
#define realloc realloc_hack
#endif

// Variables
static int length;         // c
static int fd;             // p
static int block_num;      // o
// Golfed code assumes char is unsigned (-fno-signed-char or any non x86 arch)
static unsigned char *ptr; // r
static unsigned char *end; // t
static unsigned char temp; // d
static unsigned char i;    // s
// ungolfed code just uses [] and writes out of bounds
// This requires -fno-common on some toolchains to keep it
// from reordering the variables and breaking things.
static char filename_header[0x80]; // w

// The magic header split into bits
// The golfed code puts binary data in a string literal, inline 
static const unsigned char magic[] = {
    0,0,1,1,0,0,0,1, // 0x31
    0,1,0,0,0,0,0,1, // 0x41
    0,1,0,1,1,0,0,1, // 0x59
    0,0,1,0,0,1,1,0, // 0x26
    0,1,0,1,0,0,1,1, // 0x53
    0,1,0,1,1,0,0,1, // 0x59
};

static void write_block(void) // n
{
    // Filenames are BZhNi, where i is the block number and N is the window size
    sprintf(filename_header + 4, "%i", block_num++);
    // while and if function the same, while compresses better
    while (ptr < end) {
        // Create the file. The golfed code just uses mode 004, you will
        // need to chmod it to read it.
        fd = creat(filename_header, 0644);
        // Write the header which is the first 4 bytes of the filename
        write(fd, filename_header, 4);
        // Loop through the block
        while (ptr < end) {
            // Consolidate the bits
            for (i = 0; i < 8; i++) {
                temp += temp; // shift left
                if (ptr < end) {
                    temp += *ptr++;
                    length--;
                }
             }
             // Write to the file
             write(fd, &temp, 1);
        }
        // close the file
        // This is skipped in the golfed code, surely we won't run into the file limit :P
        close(fd);
    }
}

int main(void)
{
    // Defaults to 0 anyways
    fd = STDIN_FILENO;
    // Read the header to the same buffer as the filename
    read(fd, filename_header, 4);
    // Read the rest of the archive from stdin
    while (read(fd, &temp, 1) > 0) {
        // Grow the buffer (define REALLOC_HACK for best results)
        ptr = realloc(ptr, length + 8);
        // Split it into bits
        for (i = 0; i < 8; i++) {
            ptr[length++] = temp >> 7;
            // Shift left
            temp += temp;
        }
    }
    // Scan the string for the magic bits with memmem
    // While this doesn't require the first bits to be 0x314159265359,
    // it will always write the first block as such.
    while ((end = memmem(
                     ptr + sizeof(magic), length - sizeof(magic),
                     magic, sizeof(magic)
                  )
           ) != NULL) {
        write_block();
    }
    // Write tail block (may be empty)
    end = ptr + length;
    write_block();
}

Python 3 -- 375(534 chars)

It can process the example file d2.in in about 10 seconds, using Pypy3. With CPython 3, it will cost 60 seconds.

It's interesting and weird that some old golf tricks would make the score worse! For example, replacing <space>+<space> by + would increase the score from 375 to 384; replacing all range by R will also increase the score to 384.

Golfed

import sys
from collections import deque
def T(B):
 for i in range(0,len(B),8):
  x=0
  for j in range(min(8,len(B)-i)): x |= B[i + j] << (7-j)
  yield x
V=sys.argv
C=F=0
B=bytearray()
I=open(V[1],'rb')
H=I.read(4)
P=deque([(x>>(7-j)) & 1 for x in b'1AY&SY' for j in range(8)])
Q=deque([0]*48)
for x in I.read():
 for j in range(8):
  b=(x>>(7-j)) & 1
  B.append(b)
  Q.popleft()
  Q.append(b)
  if Q!=P: continue
  C+=1
  M,B=B[:-48],B[-48:]
  if F:F.write(bytes(T(M)))
  F=open(V[2] + str(C),'wb')
  F.write(H)
F.write(bytes(T(B)))

Base64

QlpoOTFBWSZTWSoMWQ0AAMLfgAAQYf901zlibSo/t//kMAFaCIaCQANTTTKMnlGg8oeRpPaoNDDAAAAAaAAAAABKaSGmmpplD0jQaAABo00ZNqLfKsdHBd3QgkQ2wY3RfYpMMjPpsyit5Ee6+k6E6ObzlFT2vmgbSKPkni7ZgDkcwTRiLFBmNCuEX/HtOC8KJ+Rg51SkMCiAGXZpXbvaK+DzMLvaEhAM8reCB8HFAsNYfRghS9+i5/IpLrDzGZVhsbVOpQzHIIVZ7zCCgDkcElOMvTuvfw2bruKz2WQuvEg99gDa8T3Kw2GdyMNjEVFVVQj4sU1MaOTUcwQROwt86LfUZAOJE6HqnB60CFO38NJmpifx+trcQ8x5alqRiEGoKC2wJpPkCXpT7fbtFDbLCLQaplK23qMTC5gqGwCghjN3zEtwyKhCz3kixqBr7gya6yPG03rMOGY9hlmV3tS0W/SdC2taUpCoNSEaEnUXckU4UJAqDFkN