g | x | w | all

Bytes	Lang	Time	Link
6737	Acc!!	240803T000333Z	emanresu
1800	JavaScript Node.js	240816T210842Z	Andrew B
931	tinylisp	240821T061040Z	Andrew B
1392	Setanta	240804T060806Z	bb94
2422	Ceylon	151108T230523Z	Paŭlo Eb
nan	Python 2	151105T021914Z	Ell
1095	C GNU	151107T110750Z	feersum

Acc!!, 9450 9334 8897 6737 bytes

Count y while 1 {
Count z while y/256 {
Count x while z/y^4 {
Count w while x/352 {
1113*2^x+2^8833+69*2^292+2^262+69*2^516
Count h while h-10 {
_+(258+h*y+2^73+(882+h*24)*(2^80+0^(h/9)*(4^56+y^5))+(15*4^56+6*y^5)*0^(h/9)+257*4^72+(684743974900/20^h%20+99)*4^92)*y^(864+h*24)
}
Count h while w/320 {
_+(N-_/2^w%y)*2^w
_+(1-3302829851648/2^(_/2^w%z)%2+z*0^(_/z^14%z)-_/z^4%z^2)*z^4
Count f while 0^(_/z^4%z^2)+0^(_/2^w%z-40)^2-f {
_+(1-_/z^2%z^2)*z^2
Count g while _/z^2%z*(_/z^14%z)*(g-_/y^(_/z^14%z+1)%z) {
_+(_/y^(_/z^14%z+g+5)%y-_/z%z)*z
_+(1023*y^6/2^(_/z%z)%2+z*(_/z^3%z*10+_/z%z-48)-_/z^2%z^2)*z^2
}
_+((_/z^3%z+z/2)*y^(_/2^x%z+1)+(_/2^x-_/z^14)%z*z^14+5*2^x)*0^0^(_/z^2%z*(_/z^14%z))
_+(2+_/z^14%z*y)*y^(_/2^x%z)+(_/y^(_/z^8%z)%z-_/z%z)*z
_+(_/2^x%z-_/y^(_/z^8%z)%z^2)*y^(_/z^8%z)+_/2^x%z*y^((_/y^(_/z^8%z-4)%z-4)*0^(_/z%z)+_/z%z+5)+9*2^x
_+0^(_/z^14%z)*8^86-_/z^14%z*z^14
}
_+(y^(_/2^x%z)+(_/2^x%z-_/z^14%z)*z^14+5*2^x)*(_/z^5%z)*(_/z^4%z)
_+(_/2^w%z*y^(_/2^x%z)+2^x+y^(_/z^14%z+1))*(_/z^4%z)
_-0^(_/2^w%z-41)^2*(_/y^(_/z^8%z)%z*y^(_/z^8%z)+8^86)
Count d while _/y^(_/z^9%z+5)%z*d+0^(_/2^w%z+d) {
_+(_/y^(_/z^9%z+5)-_/z^9)%z*z^9+(64-_/z^7%z)*z^7
_+(2+y*11+2^73+2^80*(_/y^(_/z^9%z+1)%z)-_/y^(_/2^x%z)%4^72)*y^(_/2^x%z)+((_/2^x%z+9)*(z+1)-9-_/y^(_/z^7%z)%z^2)*y^(_/z^7%z)+9*2^353
Count f while _/z^7%z/64 {
_+(_/y^(_/z^7%z)%z^2-_/z^5%z^2)*z^5
_+(_/y^(_/z^5%z+1)%z^2*0^0^(_/z^5%z)-_/z%z^2)*z
_+(((_/y^(_/z%z+5)-_/z)%z*0^0^(_/z%z/64)+z*(_/z^6%z))*y^(_/z^5%z+1)-_/y^(_/z^7%z+4)%z*y^(_/z^7%z+4))*0^0^(_/z%z*0^(_/z^2%z)*(0^(_/z%z-8)^2+0^(_/y^(_/z%z+1)%z)*(_/z%z/64)))+(_/z^6%z*y^(_/z^2%z+5)-_/y^(_/z^7%z+4)%z*y^(_/z^7%z+4))*0^(0^(_/z^2%z)+_/z^2%z*(_/y^(_/z^2%z+5)%z)+(_/z%z-9)^2)+(_/z^6%z*y^(_/z^5%z+5)+(_/y^(_/z^6%z+5)-_/y^(_/z^7%z+4))%z*y^(_/z^7%z+4)-_/y^(_/z^6%z+5)%z*y^(_/z^6%z+5))*0^(_/z%z-10)^2*0^(_/z^2%z)
_+(_/y^(_/z^7%z)%z^2-_/z^5%z^2)*z^5
Count n while _/z^6%z*0^n {
_+(_/y^(_/z^6%z+1)-_/z^13)%z*z^13
Count p while 0^(1-_/y^(_/z^13%z)%y)^2*0^p {
Count e while (_/z^7%z-8*e)/72*0^(_/z^12%z)+0^e {
_+(_/y^(_/z^7%z-8*e-8)%z^2-_/z^3%z^2)*z^3
_+(_/z^3%z*0^0^(_/y^(_/z^3%z+1)%z/64*0^(_/z^4%z))-_/z^12%z)*z^12
}
_+(_/y^(_/y^(_/z^12%z+1)%z+1)%z+z*(_/y^(_/z^12%z+5)%z)-_/z%z^2)*z
_+((2+_/z%z*y+2^73+_/z^2%z*2^80)*y^(_/2^x%z)+((_/2^x%z+9)*(z+1)-9-_/z%z^2)*z+9*2^353)*0^0^(2-_/y^(_/z%z)%y)
Count e while 0^(_/z^15%z)*(e-2)+0^e {
_+(873+z*864-_/z%z^2)*z*e
Count g while _/z%z {
_+(_/y^(_/z%z+5)%z+z*(_/y^(_/z^2%z+5)%z)-_/z%z^2)*z*0^0^g
_-_/z%z*z*0^((_/y^(_/z^13%z+1)-_/y^(_/y^(_/z%z+1)%z+1))%y^(_/y^(_/z^13%z+1)%z+4))
}
_+(_/y^(_/z^2%z+1)%z-_/z^15%z)*z^15
}
}
_+(_/z^13%z-_/z^15%z)*0^(_/y^(_/z^13%z)%y)*z^15
Count p while (2-_/y^(_/z^13%z)%y)*0^p {
Count h while 0^0^(_/2^w%z)*(_/y^(_/2^w%z+5)%z)+0^h {
_+(_/y^(_/2^w%z+5)%z*0^0^h+_/z^5%z*0^h-_/2^w%z)*2^w
}
_+(2+_/z^15%z*y)*y^(_/2^x%z)+_/2^x%z*y^(_/2^w%z+5+(_/z^7%z-5)*0^(_/2^w%z))+9*2^x
_+(_/y^(_/z^6%z+5)%z-_/y^(_/z^7%z+4)%z)*y^(_/z^7%z+4)
}
_+(2^227+_/z^13%z*y^(_/z^7%z+12))*0^(2-_/y^(_/z^13%z)%y)
}
_+(_/y^(_/z^7%z)%z^2-_/z%z^2)*z
_+(_/y^(_/z%z+1)%z^2-_/z^3%z^2)*z^3
Count c while 0^(_/z^2%z+c) {
_+(_/y^(_/z^4%z+1)%z+z*(_/y^(_/y^(_/z^4%z+5)%z+1)%z)-_/z^5%z^2)*z^5
Count p while 0^(_/z^3%z-11)^2*0^p {
Write 40)*(0^(2-_/y^(_/z^5%z)%y)+0^(_/z^5%z)
_+(_/z^5-_/y^(_/2^x%z))%z*y^(_/2^x%z)
_+(_/2^x+4-_/z^8)%z*z^8
Count h while (_/z^8-_/2^x)%z {
_+(_/y^(_/z^8%z-4)%z-_/2^w%z)*2^w
Count l while (2-_/y^(_/2^w%z)%y)*0^l {
Write 41)*0^(_/2^w%z
Count m while _/2^w%z*0^m {
Count e while (_/y^(_/2^w%z)%y)*(e-_/y^(_/2^w%z+1)%z) {
Write _/y^(_/2^w%z+e+5)%y
}
Count j while (1-_/y^(_/2^w%z)%y)*0^j {
_+((_/y^(_/2^w%z+1)%z-z/2)*(_/y^(_/2^w%z+1)*2/z%2*2-1)-_/z^2%z)*z^2
Write 45)*(1-_/y^(_/2^w%z+1)*2/z%2
Count i while 10-i {
Write 48+_/z^2%z/10^(9-i)%10)*0^0^(_/z^2%z/10^(9-i)+0^(9-i)
}
}
}
_-_/y^(_/z^8%z)%z*y^(_/z^8%z)-8^86
Write 32)*0^0^(_/y^(_/z^8%z-4)%z
}
Count l while 0^(2-_/y^(_/2^w%z)%y)*0^l {
_+(_/y^(_/2^w%z+1)%z^2*z%(z^2-1)-_/y^(_/z^8%z-4)%z^2)*y^(_/z^8%z-4)
Write 40)*0^((2-_/y^(_/y^(_/z^8%z)%z)%y)*(_/y^(_/z^8%z)%z)
_+8^86
}
}
Write 10
_-_/z^16%z^2*z^16-2^227
}
_+(_/y^(_/z%z+5)%z*y^(_/z^7%z+4)+(12-_/y^(_/z%z+1)%z^2)*y^(_/z%z+1))*0^(_/z^3%z-7)^2
Count p while 4479/2^(_/z^3%z)%2*0^p {
_+(2+_/z^5%z^2*y)*y^(_/2^x%z)
_+(12/2^(_/z^3%z)%2*_/y^(_/z^5%z-7+_/z^3%z*4)+_/2^x*0^(_/z^3%z-1)^2-_/z^12)%z*z^12+9*2^x
Count r while 48/2^(_/z^3%z)%2*0^r {
_+(z/2+_/y^(_/z^5%z+1)-_/y^(_/z^6%z+1)-_/z)%z*z
_+(_/z^3%z/5*(z/2+1-_/2^63%2)+(1-_/z^3%4)*_/z%z)*y^(_/2^x%z+1)
}
Count r while 0^(6-_/z^3%z)^2*0^r {
_+(_/2^x-_/z^8)%z*z^8
Count g while _/z%z*(_/z^8%z/(_/2^x%z)) {
_-0^((_/z^5-_/z^6)%z)*(_/y^(_/z^8%z)%z^2*y^(_/z^8%z)+2^259)
Count k while (_/z^5-_/z^6)%z*0^k {
_+(0^0^(_/z^5%z*(_/z^6%z))*0^((_/y^(_/z^5%z)-_/y^(_/z^6%z))%y)-_/z%z)*z
Count l while _/z%z*0^l {
_+((0^((_/y^(_/z^5%z+1)-_/y^(_/z^6%z+1))%z)-_/z%z)*z-_/y^(_/z^8%z)%z^2*y^(_/z^8%z)-2^259)*0^(_/y^(_/z^5%z)%y)
Count m while 0^(_/y^(_/z^5%z)%y-1)^2*0^m {
_-_/y^(_/z^8%z)%z^2*y^(_/z^8%z)-2^259-0^0^((_/y^(_/z^5%z+1)-_/y^(_/z^6%z+1))%y^(_/y^(_/z^5%z+1)%z+4))*z
}
_+((_/y^(_/z^5%z+5)%z+z*(_/y^(_/z^6%z+5)%z)+z^2*(_/y^(_/z^5%z+1)%z)+z^3*(_/y^(_/z^6%z+1)%z)-_/y^(_/z^8%z)%z^4)*y^(_/z^8%z)+2^259)*0^(_/y^(_/z^5%z)%y-2)^2
}
}
_*0^0^(_/z%z)+_%y^(_/2^x%z)*0^(_/z%z)
_+(_/y^(_/z^8%z)%z^2-_/z^5%z^2)*z^5
}
_+(0^0^(_/z%z)+z/2)*y^(_/2^x%z+1)
}
_+(4352/2^(_/z^3%z)%2*_/z^5%z+112/2^(_/z^3%z)%2*_/2^x%z)*z^12+5*2^x
Count e while _/y^(_/y^(_/z^7%z)%z+1)%z/64*0^(_/y^(_/z^7%z+4)%z)+0^e {
_-_/y^(_/z^7%z)%z^2*y^(_/z^7%z)-2^227
}
Count h while _/2^w%z*(_/y^(_/2^w%z+5)%z)+0^h {
_+(_/y^(_/2^w%z+5)*0^0^h+_/y^(_/z^7%z)*0^h-_/2^w)%z*2^w
}
_+(2+_/z^12%z*y)*y^(_/2^x%z)+_/2^x%z*y^(_/2^w%z+5+(_/z^7%z-5)*0^(_/2^w%z))+9*2^x
_+(_/y^(_/y^(_/z^7%z+4)%z+5)%z-_/y^(_/z^7%z+4)%z)*y^(_/z^7%z+4)
}
Count p while 0^(_/z^3%z-9)^2*0^p {
_+(_/y^(_/y^(_/y^(_/z^4%z+5)%z+5)%z+1)-_/z^12)%z*z^12
_+(_/z^6-_/z^12)%z*0^0^(_/z^5%z*(_/y^(_/z^5%z)%y^5-2^39))^2*z^12
_+((_/y^(_/z^7%z)%z^2-_/y^(_/z^7%z-8)%z^4)*y^(_/z^7%z-8)-2^227)*0^(0^0^(_/y^(_/z^7%z-8)%z)*(_/y^(_/y^(_/z^7%z-8)%z+1)%z)-12)^2
_+(12-_/y^(_/z%z+1)%z^2)*y^(_/z%z+1)+(2+_/z^12%z*y)*y^(_/2^x%z)+_/2^x%z*y^(_/z^7%z+4)+9*2^x
}
Count s while 0^(_/z^3%z-10)^2*0^s {
Count h while _/y^(_/z^14%z+5)%z+0^h {
_+((_/y^(_/z^14%z+5)%z+z*(_/y^(_/z^15%z+5)%z))*0^0^h+3710851744617*0^h-_/z^14%z^2)*z^14
}
_+(2+_/z^5%z*y+2^73+_/z^6%z*2^80-_/y^(_/2^x%z)%4^72)*y^(_/2^x%z)+(_/2^x%z+9)*y^(_/z^15%z+5)+_/2^x%z*y^(_/z^14%z+5)+18*2^x
_+(12-_/y^(_/z%z+1)%z)*y^(_/z%z+1)-_/y^(_/z^4%z+5)%z*y^(_/z^4%z+5)
}
Count p while _/z^3%z/64*0^p {
_+((_/y^(_/z^7%z)%z^2-_/y^(_/z^7%z-16)%z^6)*y^(_/z^7%z-16)-2^228)*0^0^(0^0^(_/y^(_/z^7%z-16)%z)*(_/y^(_/y^(_/z^7%z-16)%z+1)%z)/64*0^(12-0^0^(_/y^(_/z^7%z-8)%z)*(_/y^(_/y^(_/z^7%z-8)%z+1)%z))^2)
_+(_/2^x%z+_/y^(_/z^3%z+5)%z*z)*y^(_/z^7%z+8)+3074*y^(_/2^x%z)+(_/2^x-_/z^12)%z*z^12+9*2^x+2^227
}
}
}
}
}
}
}
}
}

Try it online! -786 thanks to Mukundan314, and -1369 by building on that idea. See the bottom of this post for a further explanation.

It's done. I'm tired. This took four weeks.

On an older version of this, running the final test case in optimised mode (using a byte array instead of a massive int, see below) took 17 minutes 39 seconds and allocated 58MB of heap space (actually 486MB of RAM). I don't have the patience to run the unoptimised version, but as it has to perform arithmetic on 100-million-digit ints millions of times, I'd be surprised if it finished in under a year. Python already struggles to even allocate one of these values, let alone perform arithmetic on it.

Acc++

Writing this project purely in Acc!! would've been a nightmare, so I created a language called Acc++ that compiles to Acc!!. You can find the interpreter/compiler for it, alongside other test files, in the github repository for this project. Its main features are comments, a hacked-together macro system using the python ast module, and an optimised mode which uses a byte array instead of a massive int, speeding things up by many orders of magnitude.

This program basically treats the accumulator _, an arbitrarily large integer that's the only way to store data, as an arbitrarily long array of bytes. By dividing it by the appropriate power of 2 with _/2^(x*8)%2^32, I can fairly easily read bytes/32-bit words, and then multiply by said power of 2 again to write them with _+(v-_/2^(x*8)%2^32). I can define macros readWord, writeWord, readByte, writeByte to do these. One nice consequence of this is that, as long as these are the only ways I'm interacting with memory, I can add a flag to replace these with custom functions that interact with a byte array, speeding things up a huge amount. I ended up adding a bunch more of these memory-access builtins for various purposes.

The program memory is split into three parts: 64 bytes of register space containing 16(ish) 4-bit registers, 800 bytes of stack space and an unbounded heap. While most of the registers are general-purpose, a few have specific purposes:

hp points to just after the end of the heap, and gets increased any time something's allocated.
sp points to the top of the stack during evaluation.
sp2, when in use, points to the top of the "second stack" - an extra stack allocated on top of the heap, used for printing and checking equality.
rf stores the source code during evaluation.

I also created a few useful macros. Acc!!'s while loop syntax increments a variable starting from 0 and loops while a certain expression is true:

Count i while [expr] {
[code]
}

So I can make an if statement with:

Count i while [expr]*0^i {
    [code]
}

(0^i is 1 if i is 0 and 0 otherwise. I've used it a lot). Acc!! doesn't allow nested while loops using the same variable, so I've defined If, If2 ... If10 all with different variables to get around this. I've also defined While through While5 for similar reasons. The other utility macros I've defined are conc, which executes a sequence of statements, and mov, which is effectively a writeWord(x, readWord(y)).

You can find the Acc++ source for this project in def.acc, although be aware that it's a hacky mess and contains a ton of obsolete code. It's loosely commented, but probably not particularly readable, especially with all the golfing optimisations I've added.

Storage

Since tinylisp is a dynamically-typed language, values need to include their types. I ended up with the following format:

Integers are stored as a 0x00 byte followed by a signed 32-bit int. The integer n is stored as 2^31+n, so this supports signed ints between -2^31 and 2^31-1, which coincedentally is exactly what this challenge requires. Overflowing them will probably break things.
Strings (names) are stored as an 0x01 byte followed by a 4-byte length and the rest of the string. One nice thing about tinylisp is that strings are immutable, and furthermore all strings that'll ever be used at runtime are allocated during parsing.
Builtins are stored as register-space pointers (more on that later)
All lists are stored as linked lists - an 0x02 byte followed by a 32-bit head (first item) pointer and a 32-bit tail (rest of the list) pointer. There are several advantages to this:
- Tinylisp inherently uses linked lists - the head and tail operations correspond to the same parts of a linked list, and the cons operation constructs a list with a given head and tail pointer.
- Linked lists have to be (in most cases) immutable since one tail pointer can be pointed to by multiple head pointers. This means we never have to value about memory management, which is both a blessing and a curse - There are several places in this program where I allocate linked lists that are later impractical to clean up, and this results in slightly ridiculous memory usage. The tail of the last item of a list is a null pointer (see below) - for example, (3 5 4) is stored as [3, [5, [4, NULL]]].
- While initially I planned to use traditional arrays for this, they're extremely poorly suited to the task due to memory management being a pain. One other decision I've made is that all empty lists () are referred to by the NULL pointer - a pointer to address 0. While this has quite a few advantages, primarily making it easy to check if a value is null, it does result in some weird shenaniganry any time I want to append to a list, which thankfully isn't particularly often.

Evaluation

Since, as before, we have no access to any sort of recursion, we have to emulate a call stack. Each stack frame is a pair of (pointers to) lists: A list of evaluated arguments, and a list of unevaluated arguments. When evaluating a list (S-expression), the evaluated arguments list starts at (), and the unevaluated arguments list starts with the list. One quirk of this is, as the call stack is set up to only evaluate lists, we can't directly evaluate non-list values, but I've found ways around that.

To evaluate a value from the arguments list:

If it's an integer or NULL, it's passed as normal.
If it's a string, it's looked up, first in the local dictionary (obtained by looking upwards through the call stack until we hit a function), and then in the global dictionary. An undefined variable name will (probably) return null.
If it's a list, we push () and it onto the call stack.

The first item of a list is the function to call, and is always evaluated. After that, we check if it's a macro (q or starts with an empty list), and if so push the rest of its arguments to the evaluated stack. While i and d are also macros, they need some special handling:

i needs to have its first argument, the condition, evaluated regardless, but once that's been evaluated we push the other two arguments to the evaluated stack.
d needs to have its first argument (the name) left unevaluated, so we push that to the evaluated stack unchanged, but it does need the second argument (the value) evaluated, so we leave that alone.

One trick I've used is to have builtins point to addresses in the register space: c points to address 1, h to address 2, etc. This makes it easy to distinguish builtins from regular functions, and also makes it very easy to concisely check what a builtin is.

Also, I mentioned before that I can't directly evaluate non-list values. To get around this, I've defined two builtins that are only accessible by the interpreter itself:

print (address 11) is used at the top-level to print the result of an expression. It uses a fairly simple algorithm with the second stack to print a value. If given a builtin, it'll attempt to print the value at that index, which is undefined behaviour.
id (address 12) simply takes one argument and evaluates it. While this sounds useless, it's actually quite powerful: Certain builtins can rearrange the call stack to control what they evaluate, by replacing calls to themselves with a call to id. I'll explain this more shortly.

Once a function call has had all its arguments evaluated (/ otherwise dealt with), we check what the first argument, the function, is. If it's a builtin:

head, tail, and cons correspond to getting the head/tail of a linked list and constructing a new list respectively, all of which are operations I've already defined.
subtract and lessthan are also quite simple to implement. However, they share a lot of the same code: They both need to subtract their arguments, then s allocates a new int with that value, and l allocates a new int with whether that value is negative.
Since equal has to perform a deep equality check, it uses the "second stack" allocated on top of the heap. For performance reasons and to catch nulls, it first checks if the pointers to two values are equal, meaning that checking two equivalent builtins for equality will actually work. Checking a builtin and any other value (including another builtin) could return 1, return 0, or crash depending on the exact values of certain registers at the time.
q (and id) simply return their argument.

If one of the above is called, its return value is pushed to the evaluated args list of the previous stack frame, and the expression itself is popped from the evaluated args list. The remainder of the builtins require special handling:

d needs to not be able to redefine existing values. As dictionary lookup starts from the beginning, prepending the given name and value to the global dictionary would actually succeed, so we instead append them, and then return the given name wrapped in id.
v simply replaces itself with a call to id.
i checks if its first argument is truthy (not NULL, not the integer 0), and replaces itself with a call to id containing either the second or third argument depending on the first. For TCO reasons (see below), if it's itself the result of a call to id we squash that call.

Otherwise, the value is a function, and the eval'd stack is structured as ((names body) args). We push body to the stack wrapped in a call to id. When looking up the local dictionary, if names is a singleton list, we wrap it and args in a list to treat it as a single variable.

This makes TCO very easy - we can simply check if the item two steps back on the stack is a function call, and the item after that is a call to id, and if so we can simply remove both of those calls.

Then, to evaluate a piece of code, we simply parse it (the parser uses a simple one-pass algorithm) and evaluate+print all its expressions.

Golfing

It's difficult to portray just how much effort went into golfing this. The original compiled version was around 30KB, and I've spent over two weeks golfing it down to the 9.5KB version here. I'll list some of the tricks I used here.

Before I start, to put some sizes into context:

Reading the cheapest register, ra, is 10 bytes: _/2^32%2^32. Some of the other registers are 11 bytes due to having a 3-digit address.
Writing to ra is 22 bytes: _+(v-_/2^32%2^32)*2^32. For the same reasons as above, some registers are 24 bytes.
An if statement has an overhead of 25 bytes: Count v while (cond)*0^v {\ncode\n}

Also, because I'm using a macro system, macros that use an argument multiple times will simply have that argument placed multiple times within their code. For example, I have the following macro head_ll(x) that gets the head pointer of a linked list:

#defm head_ll(x) 0^0^x * readWord(x+1)

However, this specific macro is null-safe: The 0^0^x* at the start ensures that the macro returns 0 when given 0 (which isn't guaranteed otherwise), but this comes at the cost of having to use x twice. If an already large expression is passed to head_ll, it'll double in length, and stacking head_ll multiple times blows up the size exponentially. There are two ways around this - one way is to, where possible, use the following unsafe version that only uses x once:

#defm u_head_ll(x) readWord(x+1)

The other way is to extract expensive expressions into registers:

writeWord(ra, [expensive expression])
head_ll(rax)

First, the code generated by Acc++ isn't particularly golfed, containing a ton of whitespace. Because of this, I wrote an intermediate compressor that removes whitespace and replaces a few common patterns with shorter versions.

One important thing is that we're not limited to just 32-bit read/writes. By simply replacing 32 with 64 in read/write calls, I can read/write two words at once for almost the same byte count. When reading/writing to consecutive addresses, this both saves a ton of bytes and improves performance (since the overhead of interacting with a massive int is so large). For example, as the registers ra and rb are consecutive in memory, I can replace writeWord(ra, 5) ; writeWord(rb, 6) with write_dword(ra, 5, 6) and save a ton of bytes in the compiled result.

Perhaps unsurprisingly, though, the largest golfs came from logic optimisations. For example, I need to search through both the local and global dictionaries when looking up a variable name. Originally, this required two separate but almost-identical pieces of code to search through the dictionaries, but by looping twice, searching through the local dictionary on the first iteration and the global dictionary on the second iteration, I can remove almost half the code.

Another of these logic optimisations is initialising the global dictionary. It's stored as two linked lists: one of names and one of values. Initially, I had a ton of code to allocate linked lists, but by carefully constructing it with a for loop (see the first three lines) I've saved almost a kilobyte.

Another trick I used is something I call parallel writes. All calls involving writing memory amount to _+[x]. If I have multiple write calls in a row that don't rely on each other _+[x], _+[y], _+[z] I can merge them into _+[x]+[y]+[z], saving two bytes per use. This doesn't sound like much, but it has another advantage.

As I mentioned before, an if statement has an overhead of 25 bytes. If I have a paralellised write call _+[x]+[y]+[z] wrapped in a conditional, I can remove the if statement entirely and replace it with _+([x]+[y]+[z])*[cond]. This saves 22 bytes - 20 if cond needs to be in brackets, and 16 if cond needs to be converted to 1 or 0. (also, I randomly discovered you can make write calls conditional while messing around yesterday, which saved ~120 bytes)

One of the last things I did was switch to length-prefixed strings. With null-terminated strings, to check if two strings are equal you have to iterate through every character until you get to the end of one string. But with length-prefixed strings, I can simply read the entirety of the string as a massive int, and check if those are equal, saving several hundred bytes (since checking string equality is used twice, once in dict lookup and once in equal).

Aside from that, there have been tons of microoptimisations - removing a byte or two from conditions, simplifying a write call, shortening mathematical expressions here and there, etc. At this point, although there are definitely a few more microoptimisations I haven't found, I'd say this is fairly well-golfed, and I doubt under 9KB is possible without significantly rewriting certain parts of the program.

Four months later, I decided to do that "significant rewrite". The main part of this was replacing traditional two's-complement signed ints with just storing 2^31+x, which removed a bunch of sign logic. I also made a ton of minor arithmetic optimisations. At this point, I've left the Acc++ source alone - the sort of tricks I use here would be quite annoying to backport into it. There's definitely a fair amount of room for improvement here.

Another eight months later, Mukundan314 found a way to save a bunch of bytes on constants used in the code e.g. 256, 352, etc - by wrapping the program in several nested loops, incrementing those variables until they reach desired values, and only then running the code - effectively assigning 352 to x, 256 to y, etc, and saving a ton of bytes.

I took this further by noticing that replacing every instance of 2^32 with a single variable woudl save over a kilobyte. The only issue is that initialising a variable to 2^32 would take that many loop iterations - which takes about five minutes alone on my machine, so the above TIO link uses a patched interpreter which skips most of the loop iterations. Combined, these changes saved over two kilobytes.

Mukundan also found some extra tricks to merge some of the loops from defining the constants.

JavaScript (Node.js), 1800 bytes

b=a=>a.replace(/\(/g,' ( ').replace(/\)/g,' ) ').trim().split(/\s+/)
c=i=>i.match(/^\d+$/)?parseInt(i):i=='nil'?null:i
d=(i,l)=>l?(t=i.shift())?t=='('?[l.push(d(i,[])),d(i,l)][1]:t==')'?l:d(i,l.concat(c(t))):l.pop():d(i,[])
R=x=>d(b(x))
function D(o,b,e){b=b||[]
e=e||[]
this.o=o
this.d={}
for(i=0;i<b.length;i++)this.s(b[i],e[i])}
D.prototype.s=function(y,v){this.d[y]=v}
D.prototype.f=function(y){return y in this.d?this:this.o?this.o.f(y):null}
D.prototype.g=function(y){e=this.f(y);if(!e)throw "not found: "+y;return e.d[y]}
F=new D(null)
F.s('c',(a,b)=>{r=b?b.slice():[];r.unshift(a);return r})
F.s('h',(a)=>a?a.length?a[0]:null:null)
F.s('t',(a)=>a?a.length>1?a.slice(1):null:null)
F.s('s',(a,b)=>a-b)
F.s('l',(a,b)=>a<b?1:0)
F.s('e',(a,b)=>{q=(a,b)=>{if(Array.isArray(a)){if(a.length!=b.length)return 0
e=1
for(i=0;i<a.length;i++){e=e&&q(a[i],b[i])}
return e}else return a==b?1:0}
return q(a,b)})
G=(a,e)=>Array.isArray(a)?a.map(k=>E(k,e)):typeof a=='string'?e.g(a):a
E=(x,n)=>{while(1){if(Array.isArray(x)){if(!x.length){return x}else{if(x[0]=='v'){x=E(x[1],n)} else if(x[0]=='d'){var y=x[1]
var v=E(x[2],n)
n.s(y, v)
return y}else if(x[0]=='i'){var t=E(x[1],n)
if(t!=null && t!=false){x=x[2]}else{if(typeof x[3]=='undefined'){return null}else{x=x[3]}}} else if(x[0]=='q'){return x[1]}else{var
f=E(x[0],n)
if(typeof f=='function'){var e=G(x,n)
return e.shift().apply(null,e)}else if((f.length>1)&&(f.length<4)){
var m=f.length==3
var v=0
var b=m?f[1]:f[0]
if(!Array.isArray(b)){b=[b]
v=1}
var h=m?f[2]:f[1]
p=x.slice()
p.shift()
var g=m?p:G(p,n)
if(v){g=[g]}x=h
n=new D(F,b,g)}}}}else{return G(x,n)}}}
H=a=>a==null?'nil':Array.isArray(a)?'('+a.map(H).join` `+')':typeof a=='function'?'<function>':a  
P=x=>H(x)
X=x=>{try{R('('+x+')').map(x=>{console.log(P(E(x,F)))})}catch(e){console.log(e)}}

Try it online!

In this golfed version, the main function that executes code is called X. X takes a single argument - the tinylisp program - executes it and returns the results of execution.

The test cases use their own execute function which in turn uses R, E, P functions (Read, Evaluate, Print). The footer in the TIO link includes the test cases from the question. Expected output is "Success!".

The following is an ungolfed version:

  function READ(x){ 
  
    const tokenize = function(input) {
      return input.replace(/\(/g, ' ( ')
                  .replace(/\)/g, ' ) ')
                  .trim()
                  .split(/\s+/);
    }
     
    const categorize = function(input) {
      if (input.match(/^\d+$/)) {
        return parseInt(input);
      } else if(input == 'nil') {
        return null ;
      } else {
        return input ;
      }
    }
  
    const parenthesize = function(input, list) {
      if (list === undefined) {
        return parenthesize(input, []);
      } else {
        var token = input.shift();
        if (token === undefined) {
          return list.pop();
        } else if (token === "(") {
          list.push(parenthesize(input, []));
          return parenthesize(input, list);
        } else if (token === ")") {
          return list;
        } else {
          return parenthesize(input, list.concat(categorize(token)));
        }
      }
    }
  
    return parenthesize(tokenize(x)) 
  }
  
  function Env(outer, binds, exprs){
    
    if(typeof binds == 'undefined'){ 
      binds = []
    }
    
    if(typeof exprs == 'undefined'){ 
      exprs = []
    }
    
    this.outer = outer
    
    this.data = {}
    
    for(var i = 0; i < binds.length; i++){ 
      this.set(binds[i], exprs[i])
    }  
  }
  
  Env.prototype.set = function(symbol, value){ 
    this.data[symbol] = value
  }
  
  Env.prototype.find = function(symbol){ 
    if(symbol in this.data){ 
      return this
    }
    if(this.outer != null){ 
      return this.outer.find(symbol)  
    }
    return null  
  }
  
  Env.prototype.get = function(symbol){ 
    var env = this.find(symbol)
    if(env == null)
      throw "not found: " + symbol
    return env.data[symbol]  
  }
  
  var repl_env = new Env(null)
  
  repl_env.set('c', (a, b) =>{r=b?b.slice():[]; r.unshift(a); return r })    
  repl_env.set('h', (a) => a?a.length?a[0]:null:null)    
  repl_env.set('t', (a) => a?a.length>1?a.slice(1):null:null)    
  repl_env.set('s', (a, b) => a - b)    
  repl_env.set('l', (a, b) => a < b ? 1 : 0)
  repl_env.set('e', (a, b) => { 
    eq=(a,b)=>{ 
      if(Array.isArray(a)){ 
        if(a.length!=b.length) return 0
        e=1
        for(i=0;i<a.length;i++){ 
          e=e&&eq(a[i],b[i])    
        }
        return e 
      }
      else        
        return a==b ? 1 : 0
    }    
    return eq(a, b)
  })    
  
  function EVAL(x, env){ 
    
    function eval_ast(ast, env){ 

      if(Array.isArray(ast)){ 
        return ast.map(k => EVAL(k, env))
      } else if(typeof ast == 'string'){ 
        return env.get(ast)
      } else { 
        return ast  
      }
    }

  while(true){   
    
    if(Array.isArray(x)){ 
      if(x.length == 0){ 
        return x
      } else { 
 
          if(x[0] == 'v'){ 
            x = EVAL(x[1], env) 

          } else if(x[0] == 'd'){ 
            var symbol = x[1]
            var value = EVAL(x[2], env) 
            env.set(symbol, value)
            return symbol

          } else if(x[0] == 'i'){ 
            var test = EVAL(x[1], env)
            if(test != null && test != false){ 
              x = x[2]
            } else { 
              if(typeof x[3] == 'undefined'){ 
                return null
              } else { 
                x = x[3]
              }
            } 
    

          } else if(x[0] == 'q'){ 
            return x[1]
        
        } else { 
          var first = EVAL(x[0],env)
          
          if(typeof first == 'function'){ 
            var e = eval_ast(x, env)
            var fn = e.shift()
            return fn.apply(null, e)
          }

          else if((first.length > 1)&&(first.length < 4 )){ 

            var m = first.length == 3

            var variablearity=false 

            var binds = m ? first[1] : first[0]

            if(!Array.isArray(binds)){ 
              binds=[binds]
              variablearity=true 
            }

            var body = m ? first[2] : first[1]

            p = x.slice()
            p.shift()

            var args = m ? p : eval_ast(p, env)
            if(variablearity){ 
              args=[args]
            }

            var newEnv= new Env(repl_env, binds, args)

            x = body
            env = newEnv
            
          }
        } 
      }
    } else { 
      return eval_ast(x, env)
    }
  }

  }
  
  function PRINT(x){ 
  
    function stringify(a){ 
      if(a == null){ 
        return 'nil'
      } else if(Array.isArray(a)){ 
        return '(' + a.map(stringify).join(' ') + ')'
      } else if(typeof a == 'function') { 
        return '<function>'
      } else { 
        return a
      }   
    }  
  
    return stringify(x)
  }
  
  function exec(x){ 
    try{ 
      READ('('+x+')')
        .map(x=>{ 
          console.log(PRINT(EVAL(x, repl_env)))
        })
    }catch(e){ 
      console.log(e)
    }  
  }

Approach

The READ function is mostly copied from Mary Rose Cook's Little Lisp Interpreter. For the rest of the code, my starting point was a JS LISP interpreter that I had already built by following Make A Lisp. I had to do quite a few modifications to get it to work with the specified constraints however.

First, I had used JS anonymous function for user defined functions. I had to change that since in tinylisp we only know if there is a user defined function by inspecting the length of the list at the first position in a list. Also, I had used functions for my special forms, but that had to change to get TCO to work. In the end EVAL is one long function which contains most of the tinylisp workings except for built-in functions which remained in their own JS functions.

When golfing my solution, I replaced all variable names with single-letter variable names. I also moved a few functions out to global scope, which enabled further shortenings. If you inspect the code you will see several occurrences of the var keyword. I haven't checked all of these are necessary, but I believe most of them are. Even with TCO in place EVAL still calls itself in some places, and so some variables need to be defined locally and not globally.

In the code there is one class (prototype based), which means in turn that I have a few places where I've used function keyword rather than ES6 arrow functions. Recall that ES6 arrow functions don't define this, and hence can't be used for class methods.

Update

After thinking about it some more, I realized there is a flaw in my submission. Since this is LISP, the built-ins c, h and t should all have O(1) time complexity. What I had done is to implement S-Expressions as Arrays, and so needed to slice() my S-expression when I wanted to get the tail.

We can test this with the following tinylisp code:

(d K 1000000)

(d f 
(q((x y)
  (i
    x
    (f (s x 1) (c 1 y))  
    y
  )
)))

(d M (f K (q(1))))

(d g 
(q((x y)
  (i
    x
    (g (s x 1) (c (t M) y))  
    y
  )
)))

(d S
(q ((x)
  ()
)))

(S (g K ()) )

Try it online!

In the code, I create an S-expression with 1,000,000 elements, and then try to call t on it 1,000,000 times. It runs in about 50 seconds on TIO. This suggests to me that the reference implementation is doing things properly :)

With my JavaScript code however, I get out-of-memory error. Why? Because I am trying to create 1,000,000 copies of a 1,000,000 element expression.

So, I re-wrote my code. I include the un-golfed version here for you to look at (If I have time, I will update my golfed version too). It runs the test code in about 5 seconds on TIO.

/*
 In this version, we first test the implementation to see how fast we can perform c/h/t operations.
*/

const toSExpr = function(arr) { 
    if(Array.isArray(arr)){ 
      if(arr.length == 0){ 
        return null 
      }
      else { 
        return [ toSExpr( arr[0] ), toSExpr(arr.slice(1))]
      }
    }
    else { 
      return arr 
    }
  }

// I think this is what we need in eval_ast  
const toSExpr2 = function(arr) { 
    if(Array.isArray(arr)){ 
      if(arr.length == 0){ 
        return null 
      }
      else { 
        return [ arr[0], toSExpr2(arr.slice(1))]
      }
    }
    else { 
      return arr 
    }
  }


function READ(x){ 
  
    const tokenize = function(input) {
      return input.replace(/\(/g, ' ( ')
                  .replace(/\)/g, ' ) ')
                  .trim()
                  .split(/\s+/);
    }
     
    const categorize = function(input) {
      if (input.match(/^\d+$/)) {
        return parseInt(input);
      } else if(input == 'nil') {
        return null ;
      } else {
        return input ;
      }
    }
  
    const parenthesize = function(input, list) {
      if (list === undefined) {
        return parenthesize(input, []);
      } else {
        var token = input.shift();
        if (token === undefined) {
          return list.pop();
        } else if (token === "(") {
          list.push(parenthesize(input, []));
          return parenthesize(input, list);
        } else if (token === ")") {
          return list;
        } else {
          return parenthesize(input, list.concat(categorize(token)));
        }
      }
    }

    var sExpr = toSExpr( parenthesize(tokenize(x)) )
     
    return sExpr
  }
  
  function Env(outer, binds, exprs){
    
    if(typeof binds == 'undefined'){ 
      binds = []
    }
    
    if(typeof exprs == 'undefined'){ 
      exprs = []
    }
    
    this.outer = outer
    
    this.data = {}
    
    for(var i = 0; i < binds.length; i++){ 
      this.set(binds[i], exprs[i])
    }  
  }
  
  Env.prototype.set = function(symbol, value){ 
    this.data[symbol] = value
  }
  
  Env.prototype.find = function(symbol){ 
    if(symbol in this.data){ 
      return this
    }
    if(this.outer != null){ 
      return this.outer.find(symbol)  
    }
    return null  
  }
  
  Env.prototype.get = function(symbol){ 
    var env = this.find(symbol)
    if(env == null)
      throw "not found: " + symbol
    return env.data[symbol]  
  }
  
  var repl_env = new Env(null)
  
  repl_env.set('c', (a, b) =>[a,b])    
  repl_env.set('h', (a) => a?a.length?a[0]:null:null)    
  repl_env.set('t', (a) => a?a.length?a[1]:null:null)   
  repl_env.set('s', (a, b) => a - b)    
  repl_env.set('l', (a, b) => a < b ? 1 : 0)
  repl_env.set('e', (a, b) => { 
    eq=(a,b)=>{ 
      if(Array.isArray(a) && Array.isArray(b)){ 
        return eq(a[0], b[0]) && eq(a[1], b[1])
      }
      else if(Array.isArray(a) || Array.isArray(b)){ 
        return 0
      }
      else {
        return a==b ? 1 : 0
      } 
    }     
    return eq(a, b)
  })    

  function lenSExpr(sExpr){ 
    if(sExpr)
      return 1 + lenSExpr(sExpr[1])  
    else 
      return 0
  }

  //translate sExpr to equivalent Array
  //only one level deep 
  function toArray(s){
    var ret = []
    while(s != null){ 
      ret.push(s[0])
      s = s[1]    
    }
    return ret 
  }

  function EVAL(x, env){ 
    
    function eval_ast(ast, env){ 
      //should take an sExpr and return an SExpr
      if(Array.isArray(ast)){ 
        var t = toArray(ast)
        var results = t.map(k => EVAL(k, env))        

        return toSExpr2(results) 

      } else if(typeof ast == 'string'){ 
        var ret = env.get(ast)
        return ret 
      } else { 
        return ast  
      }
    }

  while(true){   

    if(Array.isArray(x)){ 
      if(x.length == 0){ //not used anymore x.length can't be zero
        return x 
      } else { 
 
          if(x[0] == 'v'){ 
            x = EVAL(x[1][0], env) 

          } else if(x[0] == 'd'){ 
            var symbol = x[1][0]
            var value = EVAL(x[1][1][0], env) 
            env.set(symbol, value)
            return symbol

          } else if(x[0] == 'i'){ 
            var test = EVAL(x[1][0], env)
            if(test != null && test != false){ 
              x = x[1][1][0]
            } else { 
              if(typeof x[1][1][1] == null){ 
                return null
              } else { 
                x = x[1][1][1][0]
              }
            } 
          } else if(x[0] == 'q'){ 
            return x[1][0]
        
        } else { 
          var first = EVAL(x[0],env)
          
          if(typeof first == 'function'){ 
            var e = toArray(eval_ast(x, env))
            
            var fn = e.shift()

            return fn.apply(null, e)
          }

          else if((lenSExpr(first) > 1)&&(lenSExpr(first) < 4 )){ 

            var m = lenSExpr(first) == 3

            var variablearity=false 

            var binds = m ? first[1][0] : first[0]


            if(!Array.isArray(binds)){ 
              binds=[binds]
              variablearity=true 
            }
            else{
              binds=toArray(binds)
            }

            var body = m ? first[1][1][0] : first[1][0]

            p = x[1]

            var args = m ? p : eval_ast(p, env)


            if(variablearity){ 
              args=[args]
            }
 
            var newEnv= new Env(repl_env, binds, toArray(args))

            x = body
            env = newEnv
            
          }
        } 
      }
    } else { 
      return eval_ast(x, env)
    }
  }

  }
  
  function PRINT(x){ 
  
    function stringify(a){ 
      if(a == null){ 
        return 'nil'
      } else if(Array.isArray(a)){ 
        return '(' + toArray(a).map(stringify).join(' ') + ')'
      } else if(typeof a == 'function') { 
        return '<function>'
      } else { 
        return a
      }   
    }  
  
    return stringify(x)
  }
  
  function exec(x){ 
    try{ 
      toArray(READ('('+x+')'))
        .map(x=>{ 
          console.log(PRINT(EVAL(x, repl_env)))
        })
    }catch(e){ 
      console.log(e)
    }  
  }

//------------------------------------------------------------------------------------------------------------

function do_test(t, e){ 
    function test_exec(x){ 
        return toArray(READ('('+x+')'))
            .map(y=>{ 
              //console.log('y', y)   
              return PRINT(EVAL(y, repl_env))
            }).join('\n')
   }

   actual = test_exec(t)

   if(actual == e.split('\n').map(x=>x.trim()).join('\n')){ 
     console.log('Success!')
   }
   else{ 
    console.log('Fail!')
    console.log(actual)
   }
 }


tests = `

(d K 1000000)

(d f 
(q((x y)
  (i
    x
    (f (s x 1) (c 1 y))  
    y
  )
)))

(d M (f K (q(1))))

(d g 
(q((x y)
  (i
    x
    (g (s x 1) (c (t M) y))  
    y
  )
)))

(d S
(q ((x)
  ()
)))

(S (g K ()) )

`


expected = `K
f
M
g
S
nil`

do_test(tests, expected)

Try it online!

tinylisp, 6161 1401 1060 931 bytes

(d A(q((S T)(i T(i(e S(h T))1(A S(t T)))0
(d B(q((S T)(i(e S())T(B(t S)(c(h S)T
(d C(q((S)(B S(
(d D(q((S T U)(i S(i(A(h S)(q(40 41 32 10)))(i T(i(A(h S)(q(32 10)))(D(t S)()(c(C T)U))(D(t S)()(c(h S)(c(C T)U))))(i(A(h S)(q(32 10)))(D(t S)()U)(D(t S)()(c(h S)U))))(D(t S)(c(h S)T)U))(i T(c(C T)U)U
(d E(q((S)(i S(i(l 47(h S))(i(l(h S)58)(E(t S))0)0)1
(d F(q((S T)(i T(a S(F S(s T 1)))0
(d G(q((S T)(i S(G(t S)(a(F T 10)(s(h S)48)))T
(d H(q((S)(i(E S)(G S 0)(Z(c 36 S
(d I(q((S T)(i S(i(e(h S)41)(I(t S)(c()T))(i(e(h S)40)(I(t S)(c(c(h T)(h(t T)))(t(t T))))(I(t S)(c(c(H(h S))(h T))(t T)))))T
(d J(q((S)(h(h(I S(
(d K(q((S)(J(c 41(D S()(q(40
(d L(q((S)(Z(q (32
(d M(q(S S
(d N(q((S T)(i T(c(v(M S(h T)))(N S(t T)))(
(d X(q((S)(L(N(q Q)(K S
(d Z string
(d $c c
(d $h h
(d $t t
(d $s s
(d $l l
(d $e e
(d $v v
(d $q q
(d $i i
(d $d d
(d P(q(()(S)(i(e(type S)(q Int))S(i(e(type S)(q List))(N(q P)S)(Z(t(chars S
(d Q(q((S)(disp(v(M(q P)S

Try it online!

The golfed version's TIO link includes the test cases in the footer.

Ungolfed ( also includes test cases ) :

(d in
(q((A B)
  (i
    (e B ())
    0
    (i
      (e A (h B))
      1
      (in A (t B))
    )
  )
)))

(d rev*
(q((A B)
  (i
    (e A ())
    B
    (rev* (t A) (c (h A) B))
  )
)))

(d rev
(q((A)
  (rev* A ())
)))

(d TOK
(q((L$ A B)
  (i 
    (e L$ ())
    (i 
      (e A ())
      B
      (c (rev A) B)
    )
    (i
      (in (h L$) (q(40 41 32 10)))
      (i
        (e A ())
        (i 
          (in (h L$) (q(32 10)))
          (TOK (t L$) () B)
          (TOK (t L$) () (c (h L$) B))
        )
        (i
          (in (h L$) (q(32 10)))
          (TOK (t L$) () (c (rev A) B))
          (TOK (t L$) () (c (h L$) (c (rev A) B)))
        )
      )
      (TOK (t L$) (c (h L$) A) B) 
    )
  )
)))

(d int?
(q((L$)
  (i
    (e L$ ())
    1
    (i 
      (l 47 (h L$))
      (i 
        (l (h L$) 58)
        (int? (t L$))
        0
      )
      0
    )
  )
)))

(d times
(q((X Y)
  (i
    (e Y 0)
    0
    (a X (times X (s Y 1)))
  )
)))

(d makeint
(q((L$ X)
  (i 
    (e L$ ())
    X
    (makeint (t L$) (a (times X 10) (s (h L$) 48) )) 
  ) 
)))

(d CAT
(q((L$)
  (i
    (int? L$)
    (makeint L$ 0)
    (string (c 36 L$) )
  )
)))

(d PAREN*
(q((L$ A)
  (i
    (e L$ ())
    A
    (i
      (e (h L$) 41)
      (PAREN* (t L$) (c () A))
      (i
        (e (h L$) 40)
        (PAREN* (t L$) (c (c (h A) (h (t A))) (t (t A)) ) )       
        (PAREN* (t L$) (c (c (CAT (h L$)) (h A)) (t A) ) ) 
      )
    )
  )
)))

(d PAREN
(q((L$)
  ( h( h (PAREN*  L$ () ) ) )
)))

(d READ
(q((L$)
  (PAREN (c 41 (TOK L$ () (q(40))) )  )
)))


(d DISP
(q((L$)
  (i
    (e L$ ())
    ()
    (c  
       (c (q disp) (c (c (q q) (c (h L$)  ()) )()) )
       (DISP (t L$))
    )
  )
)))

(d PRINT
(q((L$)
  (DISP L$)
)))

(d SILENT
(q((A)
  (string (q (32)))
)))


(d list
(q(args
  args
)))


(d MAP
(q((X Y)
  (i 
    (e Y ())
    ()
    (c (v (list X (h Y) )) (MAP X (t Y)))
  )
)))


(d EXEC
(q((L$)
   (SILENT (MAP (q display) (READ L$) ) ) 
)))

(d $c c)
(d $h h)
(d $t t)
(d $s s)
(d $l l)
(d $e e)
(d $v v)
(d $q q)
(d $i i)
(d $d d)

(d clean
(q(()(A)
  (i
    (e (type A) (q Int))
    A
    (i
      (e (type A) (q List))
      (MAP (q clean) A)
      (string (t (chars A)))
    )
  )
)))

(d display
(q((A)
  (disp (v (list (q clean) A)))
)))

Try it online!

Note

In what follows, I refer to the two tinylisps as tinylispI ( the implementation version where I wrote the interpreter) and tinylispT ( the target version where you write your own code). Note that the specs for each version are different.

Limitations

There is no string support in tinylispI so to execute code you need to first translate that code into a list of char codes ( you can do that with this JS script ) then pass the list to EXEC.
There seems to be a limit on the length of char codes that I can pass in a list, so you may need to break your codes into sections.

3. Reserved Words. Because of my implementation approach, the additional functions that are available in tinylispI are also available in tinylispT which means you can use them also if you want, but you can't use their names. the (help) command shows these functions along with the ones defined in the tinylispT spec.

Approach

My first step was to write a parser which take a list of char codes and transforms it into an S-expression. In the prototyping phase, I wrote code that takes this S-expression and evaluates it (using v), and prints the results (using disp). It may seem initially like that's all that's needed but it has this draw-back: names that the user uses could collide with names that are used in the interpreter implementation. I needed to fix that. I considered these 3 alternatives:

Use globals and locals to store all names and values in S-expressions. Verdict? FAIL. Why? The tests take forever to run + there are some bugs that I couldn't resolve, the code is really complex. This is by far the coolest option, but it simply doesn't work!
Since names of 100 or more are illegal, use only names that are 100 or more chars for the implementation, especially globals. This prevents name collisions. Verdict? MAYBE - draw-backs: (a) There are still some names e.g. "string" that are built-ins in tinylispI but are not in the spec for tinylispT. User could use one of those names and get an error. We would have to add a "limitation" that lists a set of reserved words.
Transform the user's code so that all names have a pre-fix (e.g. '$'). Verdict? MAYBE.

Initially I went with option #2. However, I got some feedback that this approach is invalid, for the following reasons:

Long names (> 100 chars) are not invalid, so there could be name collisions.
built-ins (like disp) for example, are exposed so the user could use them, even though there is no mention of these in the spec.

I have switched to Option #3. Now there is a clean separation between user's code and system code. All the identifiers in the user's code are prefixed with $. I exposed exactly the ones I want exposed, by defining for example:

(d $s s)

and did that for all the built-ins from the spec. The display function I wrote maps the printed-out value back to non-prefixed, thus completing the illusion that the code is working as defined in the input string.

Setanta, 1392 bytes

Takes a long time for test #6, even with the CLI version of Setanta.

gniomh(s){i:=0F:=fad@s
gniomh L(x){toradh go_teacs(x)[0]=="["}gniomh p(){nuair-a i<F&s[i]<"!" i+=1ma i>=F toradh!1t:=""ma s[i]=="("{t=[]i+=1nuair-a 1{ma s[i]==")"{i+=1toradh t}t+=[p()]nuair-a s[i]<"!" i+=1}}nuair-a i<F&s[i]>" "&s[i]!="("&s[i]!=")"{t+=s[i]i+=1}le i idir(0,fad@t)ma aimsigh@(go_liosta@"0123456789"())(t[i])<0 toradh t
toradh go_uimh(t)}G:=0gniomh e(x,l){nuair-a 1{ma!L(x){le j idir(0,2){le i idir(0,fad@l)ma l[i][0]==x toradh l[i][1]l=G}toradh x}ma x==[] toradh[]f:=e(x[0],l)ma f==!0{c:=e(x[1],l)x=x[(c!=[]&c&2)|3]}no{ma go_teacs(f)[0]=="<" toradh f(x,l)A:=f[fad@f-2]m:=[]ma f[0]!=m{ma L(A) le i idir(0,fad@A)m+=[[A[i],e(x[i+1],l)]]no {le i idir(0,fad@x-1)m+=[e(x[i+1],l)]m=[[A,m]]}}no{ma L(A) le i idir(0,fad@A)m+=[[A[i],x[i+1]]]no m=[[A,cuid@x(1,fad@x)]]}x=f[fad@f-1]l=m}}}G=[["c",gniomh(x,l){toradh [e(x[1],l)]+e(x[2],l)}],["h",gniomh(x,l){x=e(x[1],l)ma fad@x toradh x[0]toradh[]}],["t",gniomh(x,l){x=e(x[1],l)toradh cuid@x(1,fad@x)}],["s",gniomh(x,l){toradh e(x[1],l)-e(x[2],l)}],["l",gniomh(x,l){toradh e(x[1],l)<e(x[2],l)|0&1}],["e",gniomh(x,l){toradh e(x[1],l)==e(x[2],l)|0&1}],["v",gniomh(x,l){toradh e(e(x[1],l),l)}],["q",gniomh(x,l){toradh x[1]}],["i",!0],["d",gniomh(x,l){s:=x[1]le i idir(0,fad@G)ma G[i][0]==s 1/0G+=[[s,e(x[2],l)]]toradh s}]]gniomh h(x){toradh (go_teacs(x)[0]=="["&"("+nasc@(thar(h,x))(" ")+")")|x}nuair-a 1{s:=p()ma!1==s bris scriobh(h(e(s,[])))}}

try-setanta.ie link

Not-quite-golfed version:

gniomh(s){
    i:=0 >-- Position for parsing
    F:=fad@s >-- Length of `s`
    gniomh L(x){toradh go_teacs(x)[0]=="["} >-- Returns whether x is a list
    gniomh p(){ >-- Parse an expression
        nuair-a i<F&s[i]<"!" i+=1 >-- Skip whitespace
        ma i>=F toradh!1 >-- Return `bréag` if there’s no more code
        t:=""
        ma s[i]=="("{ >-- Parse list
            t=[]
            i+=1
            nuair-a 1{
                ma s[i]==")"{i+=1toradh t}
                t+=[p()]
                nuair-a s[i]<"!" i+=1
            }
        }
        >-- Parse atom
        nuair-a i<F&s[i]>" "&s[i]!="("&s[i]!=")"{t+=s[i]i+=1}
        le i idir(0,fad@t)ma aimsigh@(go_liosta@"0123456789"())(t[i])<0 toradh t
        toradh go_uimh(t)
    }
    G:=0 >-- List of globals, set later because we need `e`
    >-- Lookup name (inlined in golfed version)
    gniomh a(l,s){le j idir(0,2){le i idir(0,fad@l)ma l[i][0]==s toradh l[i][1]l=G}toradh s}
    gniomh e(x,l){ >-- Evaluate `x` with locals `l`
        nuair-a 1{ >-- Loop for TCO
            ma!L(x) toradh a(l,x) >-- Evaluate atom
            ma x==[] toradh[] >-- Evaluate empty list
            f:=e(x[0],l) >-- Evaluate function call
            ma f==!0{c:=e(x[1],l)x=x[(c!=[]&c&2)|3]} >-- Case for `i`
            no{
                ma go_teacs(f)[0]=="<" toradh f(x,l) >-- Some other builtin
                A:=f[fad@f-2] >-- User function or macro
                m:=[]
                ma f[0]!=m{ >-- If first element is nil, then `f` is a func
                    ma L(A) le i idir(0,fad@A)m+=[[A[i],e(x[i+1],l)]]
                    no {le i idir(0,fad@x-1)m+=[e(x[i+1],l)]m=[[A,m]]}
                }no{ >-- `f` is a macro
                    ma L(A) le i idir(0,fad@A)m+=[[A[i],x[i+1]]]
                    no m=[[A,cuid@x(1,fad@x)]]
                }
                x=f[fad@f-1] >-- Tail call
                l=m
            }
        }
    }
    G=[ >-- Populate list of globals
        ["c",gniomh(x,l){toradh [e(x[1],l)]+e(x[2],l)}],
        ["h",gniomh(x,l){x=e(x[1],l)ma fad@x toradh x[0]toradh[]}],
        ["t",gniomh(x,l){x=e(x[1],l)toradh cuid@x(1,fad@x)}],
        ["s",gniomh(x,l){toradh e(x[1],l)-e(x[2],l)}],
        ["l",gniomh(x,l){toradh e(x[1],l)<e(x[2],l)|0&1}],
        ["e",gniomh(x,l){toradh e(x[1],l)==e(x[2],l)|0&1}],
        ["v",gniomh(x,l){toradh e(e(x[1],l),l)}],
        ["q",gniomh(x,l){toradh x[1]}],
        ["i",!0], >-- Special case for TCO
        >-- Evaluate 1/0 to raise an error
        ["d",gniomh(x,l){s:=x[1]le i idir(0,fad@G)ma G[i][0]==s 1/0G+=[[s,e(x[2],l)]]toradh s}]
    ]
    >-- Stringify expression in desired format
    gniomh h(x){toradh (go_teacs(x)[0]=="["&"("+nasc@(thar(h,x))(" ")+")")|x}
    nuair-a 1{ >-- Parse and evaluate expressions in `s`
        s:=p()
        ma!1==s bris >-- If the result is equal to `bréag`, then break
        scriobh(h(e(s,[])))
    }
}

Ceylon, 2422 bytes

(I think this is my longest golfed program yet.)

import ceylon.language{sh=shared,va=variable,fo=formal,O=Object}import ceylon.language.meta.model{F=Function}interface X{sh fo V v(S t);sh fo G g;}class G(va Map<S,V>m)satisfies X{v(S t)=>m[t]else nV;g=>this;sh S d(S s,V v){assert(!s in m);m=map{s->v,*m};return s;}}V nV=>nothing;class LC(G c,Map<S,V>m)satisfies X{g=>c;v(S t)=>m[t]else g.v(t);}alias C=>V|Co;interface Co{sh fo C st();}interface V{sh fo C l(X c,V[]a);sh default Boolean b=>0<1;sh fo C vO(X c);sh default V vF(X c){va C v=vO(c);while(is Co n=v){v=n.st();}assert(is V r=v);return r;}}class L(sh V*i)satisfies V{vO(X c)=>if(nonempty i)then i[0].vF(c).l(c,i.rest)else this;equals(O o)=>if(is L o)then i==o.i else 1<0;b=>!i.empty;string=>"(``" ".join(i)``)";hash=>i.hash;sh actual C l(X c,V[]p){value[h,ns,x]=i.size<3then[f,i[0],i[1]]else[m,i[1],i[2]];value n=if(is L ns)then[*ns.i.narrow<S>()]else ns;assert(is S|S[]n,is V x);V[]a=h(c,p);LC lC=if(is S n)then LC(c.g,map{n->L(*a)})else LC(c.g,map(zipEntries(n,a)));return object satisfies Co{st()=>x.vO(lC);};}}class S(String n)satisfies V{vO(X c)=>c.v(this);l(X c,V[]a)=>nV;equals(O o)=>if(is S o)then n==o.n else 1<0;hash=>n.hash;string=>n;}class I(sh Integer i)satisfies V{vO(X c)=>this;l(X c,V[]a)=>nV;equals(O o)=>if(is I o)then i==o.i else 1<0;hash=>i;b=>!i.zero;string=>i.string;}V[]f(X c,V[]a)=>[for(v in a)v.vF(c)];V[]m(X c,V[]a)=>a;L c(X c,V h,L t)=>L(h,*t.i);V h(X c,L l)=>l.i[0]else L();V t(X c,L l)=>L(*l.i.rest);I s(X c,I f,I s)=>I(f.i-s.i);I l(X c,I f,I s)=>I(f.i<s.i then 1else 0);I e(X c,V v1,V v2)=>I(v1==v2then 1else 0);C v(X c,V v)=>v.vO(c);V q(X c,V a)=>a;C i(X c,V d,V t,V f)=>d.vF(c).b then t.vO(c)else f.vO(c);S d(X c,S s,V x)=>c.g.d(s,x.vF(c));class B<A>(F<C,A>nat,V[](X,V[])h=f)satisfies V given A satisfies[X,V+]{vO(X c)=>nV;string=>nat.declaration.name;l(X c,V[]a)=>nat.apply(c,*h(c,a));}{<S->V>*}b=>{S("c")->B(`c`),S("h")->B(`h`),S("t")->B(`t`),S("s")->B(`s`),S("l")->B(`l`),S("e")->B(`e`),S("v")->B(`v`),S("q")->B(`q`,m),S("i")->B(`i`,m),S("d")->B(`d`,m)};[V*]p(String inp){value ts=inp.split(" \n()".contains,1<0,1<0);va[[V*]*]s=[];va[V*]l=[];for(t in ts){if(t in" \n"){}else if(t=="("){s=[l,*s];l=[];}else if(t==")"){l=[L(*l.reversed),*(s[0]else[])];s=s.rest;}else if(exists i=parseInteger(t),i>=0){l=[I(i),*l];}else{l=[S(t),*l];}}return l.reversed;}sh void run(){va value u="";while(exists l=process.readLine()){u=u+l+"\n";}V[]e=p(u);X c=G(map(b));for(v in e){print(v.vF(c));}}

I could have golfed some bytes more, as I used some two-letter identifiers in some places, but I've run out of somewhat meaningful single letters for those. Although even this way it doesn't look like Ceylon very much ...

This is an object-oriented implementation.

We have a value interface V with implementing classes L (list – just a wrapper around a Ceylon sequential of V), S (symbol – wrapper around a string), I (integer – wrapper around a Ceylon integer) and B (builtin function or macro, a wrapper around a Ceylon function).

I use the standard Ceylon equality notation by implementing the equals method (and also the hash attribute, which is really only needed for symbols), and also the standard string attribute for output.

We have a Boolean attribute b (which is true by default, overridden in I and L to return false for empty lists), and two methods l (call, i.e. use this object as a function) and vO (evaluate one step). Both return either a value or a Continuation object which allows then evaluation for one more step, and vF (evaluate fully) loops until the result is not a continuation anymore.

A context interface allows access to variables. There are two implementations, G for the global context (which allows adding variables using the d builtin) and LC for a local context, which is active when evaluating the expression of a user function (it falls back to the global context).

Symbols evaluation accesses the context, lists (if non empty) evaluate by first evaluating their first element and then calling its call method. Call is implemented just by lists and builtins – it first evaluates the argument (if a function, not if a macro) and then does the actual interesting stuff – for builtins just what is hardcoded, for lists it creates a new local context and returns a continuation with that.

For the builtins I used a trick similar to what I used in my Shift Interpreter, which allows me to define them with the argument types they need, but call them with a generic sequence using reflection (the types will be checked at call time). This avoids type conversion/assertion hassle inside the functions/macros, but needs top-level functions so I can get their meta-model Function objects.

The p (parse) function splits the string at spaces, newlines and parentheses, then loops over the tokens and builds lists using a stack and a running list.

The interpreter (in the run method, which is the entry point) then takes this list of expressions (which are just values), evaluates each of them, and prints the result.

Below is a version with comments and run through a formatter.

An earlier version before I started golfing (and still with some misunderstandings about list evaluation) is found at my Github repository, I'll put this one there soon (so make sure to look at the first version if you want the original).

//  Tiny Lisp, tiny interpreter
//
// An interpreter for a tiny subset of Lisp, from which most of the
// rest of the language can be bootstrapped.
//
// Question:   https://codegolf.stackexchange.com/q/62886/2338
// My answer:  https://codegolf.stackexchange.com/a/63352/2338
//
import ceylon.language {
    sh=shared,
    va=variable,
    fo=formal,
    O=Object
}
import ceylon.language.meta.model {
    F=Function
}

// context

interface X {
    sh fo V v(S t);
    sh fo G g;
}
// global (mutable) context, with the buildins 
class G(va Map<S,V> m) satisfies X {
    // get entry throws error on undefined variables. 
    v(S t) => m[t] else nV;
    g => this;
    sh S d(S s, V v) {
        // error when already defined
        assert (!s in m);
        // building a new map is cheaper (code-golf wise) than having a mutable one.
        m = map { s->v, *m };
        return s;
    }
}

// This is simply a shorter way of writing "this is not an allowed operation".
// It will throw an exception when trying to access it.
// nV stands for "no value".
V nV => nothing;

// local context
class LC(G c, Map<S,V> m) satisfies X {
    g => c;
    v(S t) => m[t] else g.v(t);
    // sh actual String string => "[local: ``m``, global: ``g``]";
}

// continuation or value
alias C => V|Co;

// continuation
interface Co {
    sh fo C st();
}

// value
interface V {
    // use this as a function and call with arguments.
    // will not work for all types of stuff.
    sh fo C l(X c, V[] a);
    // check the truthiness. Defaults to true, as
    // only lists and integers can be falsy.
    sh default Boolean b => 0 < 1;
    // evaluate once (return either a value or a continuation).
    // will not work for all kinds of expression.
    sh fo C vO(X c);
    /// evaluate fully
    sh default V vF(X c) {
        va C v = vO(c);
        while (is Co n = v) {
            v = n.st();
        }
        assert (is V r = v);
        return r;
    }
}
class L(sh V* i) satisfies V {

    vO(X c) => if (nonempty i) then i[0].vF(c).l(c, i.rest) else this;
    equals(O o) => if (is L o) then i == o.i else 1 < 0;
    b => !i.empty;
    string => "(``" ".join(i)``)";
    hash => i.hash;

    sh actual C l(X c, V[] p) {
        value [h, ns, x] =
                i.size < 3
                then [f, i[0], i[1]]
                else [m, i[1], i[2]];
        // parameter names – either a single symbol, or a list of symbols.
        // If it is a list, we filter it to throw out any non-symbols.
        // Should throw an error if there are any, but this is harder.
        value n = if (is L ns) then [*ns.i.narrow<S>()] else ns;
        assert (is S|S[] n, is V x);
        V[] a = h(c, p);

        // local context
        LC lC = if (is S n) then
            LC(c.g, map { n -> L(*a) })
        else
            LC(c.g, map(zipEntries(n, a)));
        // return a continuation instead of actually
        // calling it here, to allow stack unwinding.
        return object satisfies Co {
            st() => x.vO(lC);
        };
    }
}

// symbol
class S(String n) satisfies V {
    // evaluate: resolve
    vO(X c) => c.v(this);
    // call is not allowed
    l(X c, V[] a) => nV;
    // equal if name is equal
    equals(O o) => if (is S o) then n == o.n else 1 < 0;
    hash => n.hash;
    string => n;
}

// integer
class I(sh Integer i) satisfies V {

    vO(X c) => this;
    l(X c, V[] a) => nV;
    equals(O o) => if (is I o) then i == o.i else 1 < 0;
    hash => i;
    b => !i.zero;
    string => i.string;
}

// argument handlers for functions or macros
V[] f(X c, V[] a) => [for (v in a) v.vF(c)];
V[] m(X c, V[] a) => a;

// build-in functions
// construct
L c(X c, V h, L t) => L(h, *t.i);
// head
V h(X c, L l) => l.i[0] else L();
// tail
V t(X c, L l) => L(*l.i.rest);
// subtract
I s(X c, I f, I s) => I(f.i - s.i);
// lessThan
I l(X c, I f, I s) => I(f.i < s.i then 1 else 0);
// equal
I e(X c, V v1, V v2) => I(v1 == v2 then 1 else 0);
// eval (returns potentially a continuation)
C v(X c, V v) => v.vO(c);

// build-in macros
// quote
V q(X c, V a) => a;
// if (also returns potentially a continuation)
C i(X c, V d, V t, V f) => d.vF(c).b then t.vO(c) else f.vO(c);
// define symbol in global context
S d(X c, S s, V x) => c.g.d(s, x.vF(c));

// buildin function or macro, made from a native function and an argument handler
class B<A>(F<C,A> nat, V[](X, V[]) h = f)
        satisfies V
        given A satisfies [X, V+] {
    vO(X c) => nV;
    string => nat.declaration.name;
    // this "apply" is a hack which breaks type safety ...
    // but it will be checked at runtime.
    l(X c, V[] a) => nat.apply(c, *h(c, a));
}

// define buildins
{<S->V>*} b => {
    S("c") -> B(`c`),
    S("h") -> B(`h`),
    S("t") -> B(`t`),
    S("s") -> B(`s`),
    S("l") -> B(`l`),
    S("e") -> B(`e`),
    S("v") -> B(`v`),
    S("q") -> B(`q`, m),
    S("i") -> B(`i`, m),
    S("d") -> B(`d`, m)
};

// parses a string into a list of expressions.
[V*] p(String inp) {
    // split string into tokens (retain separators, don't group them –
    // whitespace and empty strings will be sorted out later in the loop)
    value ts = inp.split(" \n()".contains, 1 < 0, 1 < 0);
    // stack of not yet finished nested lists, outer most at bottom
    va [[V*]*] s = [];
    // current list, in reverse order (because appending at the start is shorter)
    va [V*] l = [];
    for (t in ts) {
        if (t in " \n") {
            // do nothing for empty tokens
        } else if (t == "(") {
            // push the current list onto the stack, open a new list.
            s = [l, *s];
            l = [];
        } else if (t == ")") {
            // build a lisp list from the current list,
            // pop the latest list from the stack, append the created lisp list. 
            l = [L(*l.reversed), *(s[0] else [])];
            s = s.rest;
        } else if (exists i = parseInteger(t), i >= 0) {
            // append an integer to the current list.
            l = [I(i), *l];
        } else {
            // append a symbol to the current list.
            l = [S(t), *l];
        }
    }
    return l.reversed;
}

// Runs the interpreter.
// This handles input and output, calls the parser and evaluates the expressions.
sh void run() {
    va value u = "";
    while (exists l = process.readLine()) {
        u = u + l + "\n";
    }
    V[] e = p(u);
    // create global context
    X c = G(map(b));
    // iterate over the expressions, ...
    for (v in e) {
        // print("  '``v``' → ...");
        // ... evaluate each (fully) and print the result.
        print(v.vF(c));
    }
}

Python 2, 685 675 660 657 646 642 640 bytes

import sys,re
E=[]
G=zip("chtsle",[eval("lambda x,y=0:"+f)for f
in"[x]+y (x+[E])[0] x[1:] x-y +(x<y) +(x==y)".split()])
def V(e,L=E):
 while 1:
    try:return e and int("0%s"%e)
    except:A=e[1:]
    if""<e:return dict(G+L).get(e,e)
    f=V(e[0],L)
    if""<f:
     if f in"iv":t=V(A[0],L);e=(e[~bool(t)],t)[f>"u"];continue
     if"e">f:G[:]+=(A[0],V(A[1],L)),
     return A[0]
    if[]>f or f[0]:A=[V(a,L)for a in A]
    if[]>f:return f(*A)
    P,e=f[-2:];L=([(P,A)],zip(P,A))[P<""]
F=lambda x:V<x<""and"(%s)"%" ".join(map(F,x))or"%s"%x
for t in re.sub("([()])"," \\1 ",sys.stdin.read()).split():
 if")"==t:t=E.pop()
 if"("==t:E+=[],
 elif E:E[-1]+=t,
 else:print F(V(t))

Reads input from STDIN and writes output to STDOUT.

Although not strictly required, the interpreter supports nullary functions and macros, and optimizes tail calls executed through v.

Explanation

Parsing

To parse the input, we first surround each occurence of ( and ) with spaces, and split the resulting string into words; this gives us the list of tokens. We maintain an expression stack E, which is initially empty. We scan the tokens, in order:

if we encounter a (, we push an empty list at the top of the expression stack;
if we encounter a ), we pop the value at the top of the expression stack, and append it to the list that was previously below it on the stack;
otherwise, we append the current token, as a string, to the list at the top of the expression stack (we keep integers as strings at this stage, and parse them during evaluation.)

If, when processsing an ordinary token, or after popping an expression from the stack due to ), the expression stack is empty, we're at a top-level expression, and we evaluate the value we'd otherwise have appended, using V(), and print its result, formatted appropriately using F().

Evaluation

We maintain the global scope, G, as a list of key/value pairs. Initially, it contains only the builtin functions (but not the macros, and not v, which we treat as a macro), which are implemented as lambdas.

Evaluation happens inside V(), which takes the expression to evaluate, e, and the local scope, L, which is, too, a list of key/value pairs (when evaluating a top-level expression, the local scope is empty.) The guts of V() live inside an infinite loop, which is how we perform tail-call optimization (TCO), as explained later.

We process e according to its type:

if it's the empty list, or a string convertible to an int, we return it immediately (possibly after conversion to int); otherwise,
if it's a string, we look it up in a dictionary constructed from the concatenation of the global and local scopes. If we find an associated value, we return it; otherwise, e must be the name of a builtin macro (i.e. q, i, d or v), and we return it unchanged. Otherwise, if e is not a string,
e is a (nonempty) list, i.e., a function call. We evaluate the first element of the list, i.e., the function expression, by calling V() recursively (using the current local scope); we call the result f. The rest of the list, A, is the list of arguments. f can only be a string, in which case it's a builtin macro (or the function v), a lambda, in which case it's a builtin function, or a list, in which case it's a user-defined function or macro.

If f is a a string, i.e., a builtin macro, we handle it in-place. If it's the macro i or v, we evaluate its first operand, and either select the second or third operand accordingly, in the case of i, or use the result of the first operand, in the case of v; instead of evaluating the selected expression recursively, which would defeat TCO, we simply replace e with the said expression, and jump to the beginning of the loop. If f is the macro d, we append a pair, whose first element is the first operand, and whose second element is the result of evaluating the second operand, to the global scope, G, and return the first operand. Otherwise, f is the macro q, in which case we simply return its operand directly.

Othrtwise, if f is a lambda, or a list whose first element is not (), then it's a non-nullary function, not a macro, in which case we evaluate its arguments, i.e., the elements of A, and replace A with the result.

If f is a lambda, we call it, passing it the unpacked arguments in A, and return the result.

Otherwise, f is a list, i.e., a user-defined function or macro; its parameter list is the second-to-last element, and its body is the last element. Like in the case of the macros i and v, in order to perform TCO, we don't evaluate the body recursively, but rather replace e with the body and continue to the next iteration. Unlike i and v, however, we also replace the local scope, L, with the new local scope of the function. If the parameter list, P, is, in fact, a list, the new local scope is constructed by zipping the parameter list, P, with the argument list, A; otherwise, we're dealing with a variadic function, in which case the new local scope has only one element, the pair (P, A).

REPL

If you want to play with it, here's a REPL version of the interpreter. It supports redefining symbols, and importing files through either the command line arguments, or the (import <filename>) macro. To exit the interpreter, terminate the input (usually, Ctrl+D or Ctrl+Z).

try:import sys,re,readline
except:0
E=[];G=zip("chtsle",[eval("lambda x,y=0:"+f)for f
in"[x]+y (x+[E])[0] x[1:] x-y +(x<y) +(x==y)".split()])
def V(e,L=E):
 while 1:
    try:return e and int("0%s"%e)
    except:A=e[1:]
    if""<e:return dict(G+L).get(e,e)
    f=V(e[0],L)
    if""<f:
     if f in"iv":t=V(A[0],L);e=(e[~bool(t)],t)[f>"u"];continue
     if"e">f:G[:]+=(A[0],V(A[1],L)),
     elif"j">f:X(open(str(A[0])).read())
     return A[0]
    if[]>f or f[0]:A=[V(a,L)for a in A]
    if[]>f:return f(*A)
    P,e=f[-2:];L=([(P,A)],zip(P,A))[P<""]
F=lambda x:V<x<""and"(%s)"%" ".join(map(F,x))or"%s"%x
def X(s,v=0):
 for t in re.sub("([()])"," \\1 ",s).split():
    if")"==t:t=E.pop()
    if"("==t:E[:]+=[],
    elif E:E[-1]+=t,
    else:
     x=V(t)
     if v:print F(x)
for f in sys.argv[1:]:X("(g %s)"%f)
while 1:
 try:X(raw_input(">."[[]<E]*3+" "),1)
 except EOFError:break
 except KeyboardInterrupt:E=[];print
 except Exception as e:print"Error: "+e.message

And here's an example session, implementing merge sort:

>>> (d let d) (d if i) (d head h) (d tail t) (d prepend c) (d less l)
let
if
head
tail
prepend
less
>>>
>>> (let list (q (x... x...)))
list
>>> (let lambda (q (() (params body) (list params body))))
lambda
>>> (let def (q (() (name params body) (
...     v (list (q let) name (list (q lambda) params body))
... ))))
def
>>>
>>> (def else(body) body)
else
>>> (def or(x y) ( if x x y ))
or
>>> (def and(x y) ( if x y x ))
and
>>>
>>> (def front-half(L) ( front-half/impl L L ))
front-half
>>> (def front-half/impl(L M) (
...     if M (
...         prepend (head L)
...                 (front-half/impl (tail L) (tail (tail M)))
...     ) (else
...         ()
...     )
... ))
front-half/impl
>>>
>>> (def back-half(L) ( back-half/impl L L ))
back-half
>>> (def back-half/impl(L M) (
...     if M (
...         back-half/impl (tail L) (tail (tail M))
...     ) (else
...         L
...     )
... ))
back-half/impl
>>>
>>> (def merge(L M comp) (
...     if (and L M) (
...         if (comp (head M) (head L)) (
...             prepend (head M) (merge L (tail M) comp)
...         ) (else (
...             prepend (head L) (merge (tail L) M comp)
...         ))
...     ) (else (
...         or L M
...     ))
... ))
merge
>>>
>>> (def sort(L comp) (
...     if (and L (tail L)) (
...         merge (sort (front-half L) comp)
...               (sort (back-half L) comp)
...               comp
...     ) (else
...         L
...     )
... ))
sort
>>>
>>>
>>> (let my-list (list 4 7 2 5 9 1 6 10 8 3))
my-list
>>> my-list
(4 7 2 5 9 1 6 10 8 3)
>>> (sort my-list less)
(1 2 3 4 5 6 7 8 9 10)
>>> (sort my-list (lambda(x y) ( less y x )))
(10 9 8 7 6 5 4 3 2 1)

C (GNU), 1095 bytes

Much of the action takes place in the giant v function. Instead of implementing tail recursion explicitly, v is structured so that many of the calls from v to v will be handled by gcc's tail recursion optimization. There is no garbage collection.

This makes heavy use of GCC extensions, so it could only be compiled with gcc (use the command gcc -w -Os tl.c). It also uses some scanf extensions which were not available on Windows, which I usually use. The prospect of writing the parser with standard scanf was so awful that I used a Linux VM to test the program instead. Parsing without scanf character classes probably would have added 100+ bytes.

#define O(...)({o*_=malloc(32);*_=(o){__VA_ARGS__};_;})
#define P printf
#define F(I,B)({for(I;x->c;x=x->l)B;})
#define Z return
typedef struct o{struct o*n,*l,*c;int i,t;}o;E(o a,o b){Z
a.n?!strcmp(a.n,b.n):a.c?b.c&&E(*a.c,*b.c)&E(*a.l,*b.l):!b.c&a.i==b.i;}p(o*x){x->t?P("%d ",x->i):x->n?P("%s ",x->n):F(P("("),p(x->c);P(")"));}o*x,G,N;*C(o*h,o*t){Z
O(c:h,l:t);}o*v(o*x,o*e){o*W(o*l,o*e){Z
l->c?C(v(l->c,e),W(l->l,e)):&N;}o*y,*a,*f;int t;Z
x->c?y=v(x->c,e),x=x->l,t=y->i,t?9/t?a=v(x->c,e),t>7?(t>8?a->c:a->l)?:a:t>6?v(a,e):t<6?x=v(x->l->c,e),t>4?C(a,x):O(t:1,i:t>3?E(*a,*x):t>2?a->i<x->i:a->i-x->i):v((a-&N&&!a->t|a->i?x:x->l)->l->c,e):(t&1&&d(x->c->n,v(x->l->c,e)),x->c):(y->l->l->l?y=y->l:(x=W(x,e)),a=y->c,v(y->l->c,a->n?O(n:a->n,c:x,l:&G):F(f=&G,(f=O(n:a->c->n,c:x->c,l:f),a=a->l);f))):x->n?e->n?strcmp(x->n,e->n)?v(x,e->l):e->c:e:x;}d(o*n,o*x){*v(O(n:""),&G)=(o){n:n,c:x,l:O()};}*R(h){char*z,*q;Z
scanf(" %m[^ \n()]",&q)>0?h=strtoul(q,&z,10),C(*z?O(n:q):O(t:1,i:h),R()):~getchar()&1?q=R(),C(q,R()):&N;}main(i){for(;++i<12;)d(strndup("slecivthqd"+i-2,1),O(i:i));F(x=R(),p(v(x->c,&G)));}

Semi-ungolfed

typedef struct o o;
struct o {
    char* n;
    o* l, //next in this list
     * c; 
    int i,
        t;
} ;



#define O(...)({o*_=malloc(32);*_=(o){__VA_ARGS__};_;})

E(o a, o b) { //tests equality 
    return
        a.n ? !strcmp(a.n,b.n) :
        a.t ? a.i==b.i :
        a.c ? b.c && E(*a.c,*b.c)&E(*a.l,*b.l) :
        !b.c
    ;
}

#define P printf


p(o*x){
    x->t?P("%d ",x->i):x->n?P("%s ",x->n):({for(P("(");x->c;x=x->l)p(x->c);P(")");});
}


o*_,G,N; //N = nil



o*C(o*h,o*t){return O(c:h,l:t);}


/*
        2 3 4 5 6 7 8 9 10 11
        s l e c i v t h d  q
    */


o* v(o* x, o* e) { //takes list, int, or name
    o*W(o* l, o* e) { //eval each item in list
        return l->c ? C(v(l->c ,e), W(l->l, e)) : &N;
    }

    o*y,*a,*f;int t;
    return x->c ? //nonempty list = function/macro call
        y = v(x->c,e), //evals to function/macro
        x = x->l,   //list position of first arg (if it exists)
        (t=y->t)?   //builtin no., if any
             t>9 ?
              t&1 ? x->c // 11 = q
                : //10 = d
                (d(x->c,v(x->l->c,e)),x->c)
           : (a = v(x->c,e), //eval'd first arg
             t)>7 ? // t/h
                (t > 8 ? a->c : a->l) ?: a
           : t>6 ? //v
                v(a,e)
           : (x = x->l, //position of 2nd arg in list
             t)>5 ? //i
                v( (a->n||a->l||a->i|a->t>1 ? x : x->l)->c, e)
           : (x = v(x->c,e), //evaluated 2nd arg
             t)>4 ? // c
                C(a,x)
           : O(t:1,i:
                t>3 ? E(*a,*x) :  //e
                t>2 ? a->i<x->i : //l
                      a->i-x->i   //s
              )
        :
        (
            y->l->l->l ? //whether this is macro
                y = y->l :
                (x = W(x,e)),  //eval args
            a = y->c,  //a = arg list
            //a = a->n ? x=C(x, &N), C(a, &N) : a, //transform variadic style to normal
            v(y->l->c,
               a->n ? //variadic
                O(n:a->n,c:x,l:&G)
              : ({
                   for(f=&G; a->c; a=a->l,x=x->l)
                      f=O(n:a->c->n, c: x->c, l:f);
                   f;
                })
            )
        )
    :
    x->n ? // name
        e->n ?
            strcmp(x->n,e->n) ?
                v(x,e->l)
            : e->c
        : e
     : x; //int or nil
}

d(o*n,o*x){
    * v(O(n:""),&G) =
        (o){n:n->n,c:x,l:O()};
}


;
o*R(){
    char*z,*q;int h;
return scanf(" %m[^ \n()]",&q)>0?
    h=strtoul(q,&z,10),
    C(*z ? O(n:q) : O(t:1,i:h), R())
: getchar()&1?&N:(q=R(),C(q,R()));
}
main(i) {

    for(;++i<12;) d(O(n:strndup("slecivthdq"+i-2,1)),O(t:i));

    o *q;
    for(q=R(); q->c; q=q->l) p(v(q->c,&G));

}