March 27, 2019

Writing a simple JSON path parser

Let's say we want to implement a simple list filtering language so we can enter a.b = 12 and return only results in a list where the a column is an object that contains a field b that is set to the value 12. What would a filter(jsonPath, equals, listOfObjects) function look like?

If we only needed to support object lookup, we might implement filter by splitting the jsonPath on periods and look at each object in the listOfObjects for matching values. It might look something like this:

function filter(jsonPath, equals, listOfObjects) {
  const parts = jsonPath.split('.');

  function filterSingle(object) {
    let objectAtPath = object;
    let i = 0;
    for (let part = parts[i]; part && objectAtPath; part = parts[++i]) {
      objectAtPath = objectAtPath[part];
    }

    return i === parts.length && objectAtPath === equals;
  }

  return listOfObjects.filter(filterSingle);
}

require('assert').deepEqual(
  filter('foo.bar', 12, [{ foo: { bar: 12 } }, { foo: null }]),
  [{ foo: { bar: 12 } }],
);

That doesn't work too badly. We haven't handled edge cases like a jsonPath of foo..bar or bar.. But those would not be difficult to handle:

function filter(jsonPath, equals, listOfObjects) {
  if (jsonPath.charAt(0) === '.') {
    throw new Error('JSON path cannot begin with a dot, in: ' + jsonPath);
  } else if (jsonPath.charAt(jsonPath.length - 1) === '.') {
    throw new Error('JSON path cannot end with a dot, in: ' + jsonPath);
  }

  const parts = jsonPath.split('.');
  if (parts.reduce((hasEmptyPart, part) => hasEmptyPart || part.length === 0, false)) {
    throw new Error('JSON path cannot contain an empty section, in: ' + jsonPath);
  }

  function filterSingle(object) {
    let objectAtPath = object;
    let i = 0;
    for (let part = parts[i]; part && objectAtPath; part = parts[++i]) {
      objectAtPath = objectAtPath[part];
    }

    return i === parts.length && objectAtPath === equals;
  }

  return listOfObjects.filter(filterSingle);
}

And we now handle the most obvious invalid path cases.

Arrays?

If we want to support array path syntax, things get harder. For example:

require('assert').deepEqual(
  filter('foo.bar[0].biz', 14, [{ foo: { bar: [ { biz: 14 }, { biz: 19 } ] } }, { foo: { bar: null } }]),
  [{ foo: { bar: [ { biz: 14 }, { biz: 19 } ] } }],
);

We could try to stick with the hammer that is String.prototype.split and write some really messy code. :) Or we could switch to an approach that gives us more control. Let's do that.

We'll build a very simple lexer that will iterate over each character accumulating characters into individual tokens that represent the pieces of the path. Let's start by supporting the original jsonPath syntax and error-handling.

function getJsonPathParts(path) {
  const parts = [];
  let currentToken = '';

  for (let i = 0; i < path.length; i++) {
    const c = path[i];
    switch (c) {
      case '.': {
        if (!currentToken) {
          throw new Error('JSON path cannot contain empty section, in: ' + path);
        }
        parts.push(currentToken);
        currentToken = '';
        break;
      }
      default: {
        currentToken += c;
        break;
      }
    }
  }

  if (!currentToken) {
    throw new Error('JSON path cannot end with dot, in: ' + path);
  }

  parts.push(currentToken);
  return parts;
}

function filter(jsonPath, equals, listOfObjects) {
  const parts = getJsonPathParts(jsonPath);

  function filterSingle(object) {
    let objectAtPath = object;
    let i = 0;
    for (let part = parts[i]; part && objectAtPath; part = parts[++i]) {
      objectAtPath = objectAtPath[part];
    }

    return i === parts.length && objectAtPath === equals;
  }

  return listOfObjects.filter(filterSingle);
}

Not too bad!

Arrays?

Right. Let's build on getJsonPathParts to support array syntax. Along with that we're going to impose some restrictions. The object path parts must be only alphanumeric characters plus dashes and underscores. The array index must only be numeric characters. Anything else should throw an error.

function getJsonPathParts(path) {
  const parts = [];
  let currentToken = '';
  let inArray = false;

  for (let i = 0; i < path.length; i++) {
    const c = path[i];
    switch (c) {
      case '.': {
        if (currentToken === '') {
          throw new Error('JSON path cannot contain empty section, in: ' + path);
        }

        parts.push(currentToken);
        currentToken = '';
        break;
      }
      case '[': {
        if (inArray) {
          throw new Error('JSON path contains unexpected left bracket, in: ' + path);
        }

        if (currentToken === '') {
          throw new Error('JSON path cannot contain empty section, in: ' + path);
        }

        parts.push(currentToken);
        currentToken = '';
        inArray = true;
        break;
      }
      case ']': {
        if (!inArray) {
          throw new Error('JSON path contains unexpected right bracket, in: ' + path);
        }

        if (currentToken === '') {
          throw new Error('JSON path array index must not be empty, in: ' + path);
        }

        // Array indices are recorded as numbers, not strings.
        currentToken = parseInt(currentToken, 10);
        inArray = false;
        break;
      }
      default: {
        const code = c.charCodeAt(0);

        if (inArray) {
          if (code >= '0'.charCodeAt(0) && code <= '9'.charCodeAt(0)) {
            currentToken += c;
            continue;
          }

          throw new Error('JSON path array index must be numeric, in: ' + path);
        }

        if ((code >= 'A'.charCodeAt(0) && code <= 'z'.charCodeAt(0)) ||
            (code >= '0'.charCodeAt(0) && code <= '9'.charCodeAt(0)) ||
            ['-', '_'].includes(c)) {
          currentToken += c;
          continue;
        }

        throw new Error('JSON path part must contain only alphanumeric characters, in: ' + path);
      }
    }
  }

  if (currentToken === '') {
    throw new Error('JSON path cannot end with dot, in: ' + path);
  }

  parts.push(currentToken);
  return parts;
}

require('assert').deepEqual(getJsonPathParts('foo.bar[0].biz'), ['foo', 'bar', 0, 'biz']);

Now we've got a simple JSON path parser with decent error handling! Of course we wouldn't want to use this little library in production until we had some serious test coverage. But writing tests and calling out my mistakes will be left here as an exercise for the reader. :)