February 26, 2019

AOT-compilation of Javascript with V8

tldr; I'm working on a AOT-compiled Javascript implementation called jsc.

Many dynamically typed programming languages have implementations that compile to native binaries:

The benefits of compiling dynamically typed languages are similar to those of compiling statically typed languages:

  • Simplified deployment via a single binary
  • Simplified foreign-function interfaces
  • Predictable performance compared to JIT compiling interpreters
  • Performance gains compared to non-JIT compiling interpreters

I (re)discovered a common technique for compiling dynamic languages while developing BSDScheme, an interpreter and compiler for Scheme. In this technique, you use core parts of the runtime code as a library that is imported and referenced by compiled code.

You save time building object-memory representations, memory management, operations, interacting with existing libraries, etc. when an interpreter already exists. The runtime as a library (plus existing parser frontends) allows you to focus solely on code generation of control flow.

The first pass

I wrote the initial version of jsc in Rust using Dave Herman's esprit parser (supports a subset of ES6 that includes all of ES5).

The interesting parts of the runtime are taken care of by V8, e.g.:

  • V8::String - a Javascript string object
    • V8::String::NewFromUtf8(isolate, "hello world!") - C++ string to Javascript string object
  • V8::Number - a Javascript number object
    • V8::Number::New(isolate, 10) - C++ double to Javascript number object
  • Heap allocations
  • Calling convention

And so on.

An example

This first version of jsc could take the following Javascript:

function fib(n, a, b) {
    if (n == 0) {
        return a;
    }

    if (n == 1) {
        return b;
    }

    return fib(n - 1, b, a + b);
}

function main() {
  console.log(fib(50, 0, 1));
}

And produce the following C++:

#include <string>
#include <iostream>

#include <node.h>

using v8::Array;
using v8::Boolean;
using v8::Context;
using v8::Exception;
using v8::Function;
using v8::FunctionTemplate;
using v8::FunctionCallbackInfo;
using v8::Isolate;
using v8::Local;
using v8::Null;
using v8::Number;
using v8::Object;
using v8::String;
using v8::False;
using v8::True;
using v8::Value;

void fib_0(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
  Local<Value> n_1 = args[0];
  Local<Value> a_2 = args[1];
  Local<Value> b_3 = args[2];
tail_recurse_4:

  Local<Context> ctx_5 = isolate->GetCurrentContext();
  Local<Object> global_6 = ctx_5->Global();
  Local<Function> Boolean_7 = Local<Function>::Cast(global_6->Get(String::NewFromUtf8(isolate, "Boolean")));
  String::Utf8Value utf8value_tmp_8(n_1);
  std::string string_tmp_9(*utf8value_tmp_8);
  String::Utf8Value utf8value_tmp_10(Number::New(isolate, 0));
  std::string string_tmp_11(*utf8value_tmp_10);
  Local<Value> argv_12[] = { (n_1->IsBoolean() || Number::New(isolate, 0)->IsBoolean()) ? Boolean::New(isolate, n_1->ToBoolean(isolate)->Value() == Number::New(isolate, 0)->ToBoolean(isolate)->Value()) : ((n_1->IsNumber() || Number::New(isolate, 0)->IsNumber()) ? Boolean::New(isolate, n_1->ToNumber(isolate)->Value() == Number::New(isolate, 0)->ToNumber(isolate)->Value()) : ((n_1->IsString() || Number::New(isolate, 0)->IsString()) ? Boolean::New(isolate, string_tmp_9 == string_tmp_11) : (False(isolate)))) };
  Local<Value> result_13 = Boolean_7->Call(Null(isolate), 1, argv_12);
  if (result_13->ToBoolean()->Value()) {
    // return a;
    args.GetReturnValue().Set(a_2);
    return;


  }

  Local<Context> ctx_14 = isolate->GetCurrentContext();
  Local<Object> global_15 = ctx_14->Global();
  Local<Function> Boolean_16 = Local<Function>::Cast(global_15->Get(String::NewFromUtf8(isolate, "Boolean")));
  String::Utf8Value utf8value_tmp_17(n_1);
  std::string string_tmp_18(*utf8value_tmp_17);
  String::Utf8Value utf8value_tmp_19(Number::New(isolate, 1));
  std::string string_tmp_20(*utf8value_tmp_19);
  Local<Value> argv_21[] = { (n_1->IsBoolean() || Number::New(isolate, 1)->IsBoolean()) ? Boolean::New(isolate, n_1->ToBoolean(isolate)->Value() == Number::New(isolate, 1)->ToBoolean(isolate)->Value()) : ((n_1->IsNumber() || Number::New(isolate, 1)->IsNumber()) ? Boolean::New(isolate, n_1->ToNumber(isolate)->Value() == Number::New(isolate, 1)->ToNumber(isolate)->Value()) : ((n_1->IsString() || Number::New(isolate, 1)->IsString()) ? Boolean::New(isolate, string_tmp_18 == string_tmp_20) : (False(isolate)))) };
  Local<Value> result_22 = Boolean_16->Call(Null(isolate), 1, argv_21);
  if (result_22->ToBoolean()->Value()) {
    // return b;
    args.GetReturnValue().Set(b_3);
    return;


  }

  // return fib(n - 1, b, a + b);
  Local<Value> arg_23 = (n_1->IsNumber() || Number::New(isolate, 1)->IsNumber()) ? (Number::New(isolate, n_1->ToNumber(isolate)->Value() - Number::New(isolate, 1)->ToNumber(isolate)->Value())) : Local<Number>::Cast(Null(isolate));
  Local<Value> arg_24 = b_3;
  Local<Value> arg_25 = (a_2->IsString() || b_3->IsString()) ? Local<Value>::Cast(String::Concat(a_2->ToString(), b_3->ToString())) : Local<Value>::Cast((a_2->IsNumber() || b_3->IsNumber()) ? (Number::New(isolate, a_2->ToNumber(isolate)->Value() + b_3->ToNumber(isolate)->Value())) : Local<Number>::Cast(Null(isolate)));
  Local<FunctionTemplate> ftpl_27 = FunctionTemplate::New(isolate, fib_0);
  Local<Function> fn_26 = ftpl_27->GetFunction();
  fn_26->SetName(String::NewFromUtf8(isolate, "fib_0"));
  n_1 = arg_23;
  a_2 = arg_24;
  b_3 = arg_25;
  goto tail_recurse_4;

}

void jsc_main(const FunctionCallbackInfo<Value>& args) {
  Isolate* isolate = args.GetIsolate();
tail_recurse_5:

  // console.log(fib(50, 0, 1))
  Local<Value> dot_parent_7 = isolate->GetCurrentContext()->Global()->Get(String::NewFromUtf8(isolate, "console"));
  Local<String> property_8 = String::NewFromUtf8(isolate, "log");
  while (dot_parent_7->IsObject() && !dot_parent_7.As<Object>()->HasOwnProperty(isolate->GetCurrentContext(), property_8).ToChecked()) {
    dot_parent_7 = dot_parent_7.As<Object>()->GetPrototype();
  }
  Local<Value> dot_result_6 = dot_parent_7.As<Object>()->Get(isolate->GetCurrentContext(), property_8).ToLocalChecked();
  Local<Value> arg_9 = Number::New(isolate, 50);
  Local<Value> arg_10 = Number::New(isolate, 0);
  Local<Value> arg_11 = Number::New(isolate, 1);
  Local<FunctionTemplate> ftpl_13 = FunctionTemplate::New(isolate, fib_0);
  Local<Function> fn_12 = ftpl_13->GetFunction();
  fn_12->SetName(String::NewFromUtf8(isolate, "fib_0"));
  Local<Value> argv_14[] = { arg_9, arg_10, arg_11 };
  Local<Value> result_15 = fn_12->Call(Null(isolate), 3, argv_14);
  Local<Value> arg_16 = result_15;
  Local<Function> fn_17 = Local<Function>::Cast(dot_result_6);
  Local<Value> argv_18[] = { arg_16 };
  Local<Value> result_19 = fn_17->Call(dot_parent_7, 1, argv_18);
  result_19;

}

void Init(Local<Object> exports) {
  NODE_SET_METHOD(exports, "jsc_main", jsc_main);
}

NODE_MODULE(NODE_GYP_MODULE_NAME, Init)

This output gets compiled (by jsc) as a Node addon using node-gyp.

The compiled addon is loaded by a single-line Javascript file generated by jsc:

$ rm -rf build
$ jsc fib.js
$ cat build/fib.js
require("build/Release/fib.node").jsc_main()
$ node build/fib.js
12586269025

Analysis

The code was a mess of bad formatting, unnecessary locals, inefficient basic operations (e.g. huge, often unnecessary Boolean conversions), and so on. The unnecessary locals was partially a by-product of single-pass code generation. And the unnecessary conversions was partly due to ignoring types (even types of literals that you don't need Typescript/Flow to provide).

After I got this proof-of-concept working for basic examples, I wanted to rewrite it around destination-driven code generation, a technique by Kent Dybvig used in V8's baseline compiler. And after a few weeks not getting far in a refactor in Rust, I rewrote the compiler in Typescript.

The second pass

Written in Typescript and using the Typescript compiler API, this second iteration was built to do destination-driven code generation and leaf type propagation. Destination-driven code generation allows a single-pass code generator to reduce redundant reassignments. And leaf type propagation allows simple, obvious optimizations such as just calling V8::Boolean::IsTrue() on a statically-known boolean rather than calling V8::Value::Equals().

Example

Given the same fibonacci Javascript program from before, this iteration produces the following C++:

#include "lib.cc"

void tco_fib(const FunctionCallbackInfo<Value>& _args) {
  Isolate* isolate = _args.GetIsolate();
  std::vector<Local<Value>> args(_args.Length());;
  for (int i = 0; i < _args.Length(); i++) args[i] = _args[i];
tail_recurse_0:

  Local<Number> sym_rhs_4 = Number::New(isolate, 0);
  Local<Boolean> sym_anon_2 = args[0]->StrictEquals(sym_rhs_4) ? True(isolate) : False(isolate);
  if (sym_anon_2->IsTrue()) {
    _args.GetReturnValue().Set(args[1]);
    return;
  }

  Local<Number> sym_rhs_11 = Number::New(isolate, 1);
  Local<Boolean> sym_anon_9 = args[0]->StrictEquals(sym_rhs_11) ? True(isolate) : False(isolate);
  if (sym_anon_9->IsTrue()) {
    _args.GetReturnValue().Set(args[2]);
    return;
  }

  Local<Number> sym_rhs_19 = Number::New(isolate, 1);
  Local<Value> sym_arg_17 = genericMinus(isolate, args[0], sym_rhs_19);
  Local<Value> sym_arg_21 = genericPlus(isolate, args[1], args[2]);
  args[0] = sym_arg_17;
  args[1] = args[2];
  args[2] = sym_arg_21;
  goto tail_recurse_0;

  return;
}

void jsc_main(const FunctionCallbackInfo<Value>& _args) {
  Isolate* isolate = _args.GetIsolate();
  std::vector<Local<Value>> args(_args.Length());;
  for (int i = 0; i < _args.Length(); i++) args[i] = _args[i];
tail_recurse_1:

  Local<Number> sym_arg_29 = Number::New(isolate, 100);
  Local<Number> sym_arg_30 = Number::New(isolate, 0);
  Local<Number> sym_arg_31 = Number::New(isolate, 1);
  Local<Value> sym_args_32[] = { sym_arg_29, sym_arg_30, sym_arg_31 };
  Local<Function> sym_fn_33 = FunctionTemplate::New(isolate, tco_fib)->GetFunction();
  sym_fn_33->SetName(String::NewFromUtf8(isolate, "tco_fib"));
  Local<Value> sym_arg_28 = sym_fn_33->Call(sym_fn_33, 3, sym_args_32);

  Local<Value> sym_args_34[] = { sym_arg_28 };
  Local<Value> sym_parent_37 = isolate->GetCurrentContext()->Global()->Get(String::NewFromUtf8(isolate, "console"));
  Local<Value> sym_anon_36 = sym_parent_37.As<Object>()->Get(String::NewFromUtf8(isolate, "log"));
  Local<Function> sym_fn_35 = Local<Function>::Cast(sym_anon_36);
  Local<Value> sym_anon_27 = sym_fn_35->Call(sym_fn_35, 1, sym_args_34);

  return;
}

void Init(Local<Object> exports) {
  NODE_SET_METHOD(exports, "jsc_main", jsc_main);
}

NODE_MODULE(NODE_GYP_MODULE_NAME, Init)

Analysis

Common code (genericPlus, genericMinus) and all imports have been pulled into lib.cc for clarity. And the entire result is run through clang-format if it is present on the system.

The benefit of leaf type propagation can be seen everywhere a local is declared that is not Local and specifically in if tests on statically known booleans:

...
Local<Boolean> sym_anon_2 = args[0]->StrictEquals(sym_rhs_4) ? True(isolate) : False(isolate);
  if (sym_anon_2->IsTrue()) {
...

It's obvious to a human that there is another optimization you could do here by not wrapping this check in a V8::Boolean at all. The only types tracked in destinations are V8 types, not yet C++ types. But not needing to passing this through a bool toBoolean(Value v) wrapper is still an improvement.

In general, unboxing has not really been explore. But the ultimate goal is to use Typescript types to produce function- or block-level unboxed versions -- perhaps using a toggle in code to specify safety à la Common Lisp.

Next steps

I broke tests and regressed on syntax support in the Typescript port, so that's the first step. The second step is enough syntax to support more interesting benchmarks than the fibonacci example (which has comparative performance to Node.js/V8 but isn't saying much).

After that:

  • Unboxed expressions
  • Unboxed blocks
  • Foreign-function interface
  • Self-hosting
  • Node-API compatible runtime without Node