// Spaces: Running on CPU Upgrade
/**
 * Represents tokens that our language understands in parsing.
 *
 * Each key maps to the string tag stored in a token's `type` field. Every value
 * is identical to its key except `Not`, which aliases "UnaryOperator".
 */
export const TOKEN_TYPES = Object.freeze({
	Text: "Text", // The text between Jinja statements or expressions

	NumericLiteral: "NumericLiteral", // e.g., 123
	BooleanLiteral: "BooleanLiteral", // true or false
	StringLiteral: "StringLiteral", // 'string'
	Identifier: "Identifier", // Variables, functions, etc.
	Equals: "Equals", // =
	OpenParen: "OpenParen", // (
	CloseParen: "CloseParen", // )
	OpenStatement: "OpenStatement", // {%
	CloseStatement: "CloseStatement", // %}
	OpenExpression: "OpenExpression", // {{
	CloseExpression: "CloseExpression", // }}
	OpenSquareBracket: "OpenSquareBracket", // [
	CloseSquareBracket: "CloseSquareBracket", // ]
	OpenCurlyBracket: "OpenCurlyBracket", // {
	CloseCurlyBracket: "CloseCurlyBracket", // }
	Comma: "Comma", // ,
	Dot: "Dot", // .
	Colon: "Colon", // :
	Pipe: "Pipe", // |

	CallOperator: "CallOperator", // ()
	AdditiveBinaryOperator: "AdditiveBinaryOperator", // + -
	MultiplicativeBinaryOperator: "MultiplicativeBinaryOperator", // * / %
	ComparisonBinaryOperator: "ComparisonBinaryOperator", // < > <= >= == !=
	UnaryOperator: "UnaryOperator", // ! - +

	// Keywords
	Set: "Set",
	If: "If",
	For: "For",
	In: "In",
	Is: "Is",
	NotIn: "NotIn",
	Else: "Else",
	EndIf: "EndIf",
	ElseIf: "ElseIf",
	EndFor: "EndFor",
	And: "And",
	Or: "Or",
	// Alias: shares its tag with UnaryOperator rather than having a tag of its own.
	Not: "UnaryOperator",
});

/** Union of the key names of `TOKEN_TYPES`. */
export type TokenType = keyof typeof TOKEN_TYPES;
/**
 * Constant lookup for keywords and known identifiers + symbols.
 *
 * Keys are the spellings as they appear in a template; values are the
 * corresponding `TOKEN_TYPES` tags.
 */
const KEYWORDS = Object.freeze({
	set: TOKEN_TYPES.Set,
	for: TOKEN_TYPES.For,
	in: TOKEN_TYPES.In,
	is: TOKEN_TYPES.Is,
	if: TOKEN_TYPES.If,
	else: TOKEN_TYPES.Else,
	endif: TOKEN_TYPES.EndIf,
	elif: TOKEN_TYPES.ElseIf,
	endfor: TOKEN_TYPES.EndFor,
	and: TOKEN_TYPES.And,
	or: TOKEN_TYPES.Or,
	not: TOKEN_TYPES.Not,
	// Contains a space, so a single-word lookup can never hit this entry;
	// `tokenize` produces NotIn by merging a `not` token with a following `in`.
	"not in": TOKEN_TYPES.NotIn,

	// Literals
	true: TOKEN_TYPES.BooleanLiteral,
	false: TOKEN_TYPES.BooleanLiteral,
});
/**
 * Represents a single token in the template.
 */
export class Token {
	/**
	 * Constructs a new Token.
	 * @param value The raw value as seen inside the source code.
	 * @param type The type of token.
	 */
	constructor(
		public value: string,
		public type: TokenType
	) {}
}
/**
 * Returns true when `char` contains a regex word character (`\w`: an ASCII
 * letter, a digit, or an underscore). Intended to be called with a single character.
 */
function isWord(char: string): boolean {
	return /\w/.test(char);
}
/**
 * Returns true when `char` contains an ASCII decimal digit (0-9).
 * Intended to be called with a single character.
 */
function isInteger(char: string): boolean {
	return /[0-9]/.test(char);
}
/**
 * A data structure which contains a list of rules to test
 *
 * Each entry pairs a literal character sequence with the token type it produces.
 * `tokenize` walks the list from top to bottom and takes the first sequence that
 * matches at the cursor, so a longer sequence must be listed before any shorter
 * sequence that is a prefix of it (e.g. "{%" before "{", "<=" before "<",
 * "==" before "=").
 */
const ORDERED_MAPPING_TABLE: [string, TokenType][] = [
	// Control sequences
	["{%", TOKEN_TYPES.OpenStatement],
	["%}", TOKEN_TYPES.CloseStatement],
	["{{", TOKEN_TYPES.OpenExpression],
	["}}", TOKEN_TYPES.CloseExpression],
	// Single character tokens
	["(", TOKEN_TYPES.OpenParen],
	[")", TOKEN_TYPES.CloseParen],
	["{", TOKEN_TYPES.OpenCurlyBracket],
	["}", TOKEN_TYPES.CloseCurlyBracket],
	["[", TOKEN_TYPES.OpenSquareBracket],
	["]", TOKEN_TYPES.CloseSquareBracket],
	[",", TOKEN_TYPES.Comma],
	[".", TOKEN_TYPES.Dot],
	[":", TOKEN_TYPES.Colon],
	["|", TOKEN_TYPES.Pipe],
	// Comparison operators
	["<=", TOKEN_TYPES.ComparisonBinaryOperator],
	[">=", TOKEN_TYPES.ComparisonBinaryOperator],
	["==", TOKEN_TYPES.ComparisonBinaryOperator],
	["!=", TOKEN_TYPES.ComparisonBinaryOperator],
	["<", TOKEN_TYPES.ComparisonBinaryOperator],
	[">", TOKEN_TYPES.ComparisonBinaryOperator],
	// Arithmetic operators
	["+", TOKEN_TYPES.AdditiveBinaryOperator],
	["-", TOKEN_TYPES.AdditiveBinaryOperator],
	["*", TOKEN_TYPES.MultiplicativeBinaryOperator],
	["/", TOKEN_TYPES.MultiplicativeBinaryOperator],
	["%", TOKEN_TYPES.MultiplicativeBinaryOperator],
	// Assignment operator
	["=", TOKEN_TYPES.Equals],
];
/**
 * Maps the character that follows a backslash in the template source to the
 * character it stands for. Characters missing from this map are not valid escapes.
 */
const ESCAPE_CHARACTERS = new Map([
	["n", "\n"], // New line
	["t", "\t"], // Horizontal tab
	["r", "\r"], // Carriage return
	["b", "\b"], // Backspace
	["f", "\f"], // Form feed
	["v", "\v"], // Vertical tab
	["'", "'"], // Single quote
	['"', '"'], // Double quote
	["\\", "\\"], // Backslash
]);
/**
 * Whitespace-control flags accepted by `preprocess`.
 */
export interface PreprocessOptions {
	/** When true, a newline directly after a statement (`%}`) or comment (`#}`) tag is removed. */
	trim_blocks?: boolean;
	/** When true, spaces and tabs between the start of a line and a statement or comment tag are removed. */
	lstrip_blocks?: boolean;
}

/**
 * Normalizes a template before it is tokenized: strips one trailing newline,
 * removes comments, applies the optional `lstrip_blocks` / `trim_blocks`
 * behaviour, and resolves the `-` whitespace-control markers on statement and
 * expression delimiters.
 *
 * @param template The raw template text.
 * @param options Optional whitespace-control flags; both default to off.
 * @returns The rewritten template text.
 */
function preprocess(template: string, options: PreprocessOptions = {}): string {
	// According to https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control

	// In the default configuration:
	//  - a single trailing newline is stripped if present
	//  - other whitespace (spaces, tabs, newlines etc.) is returned unchanged
	if (template.endsWith("\n")) {
		template = template.slice(0, -1);
	}

	// Replace all comments with a placeholder
	// This ensures that comments don't interfere with the following options
	template = template.replace(/{#.*?#}/gs, "{##}");

	if (options.lstrip_blocks) {
		// The lstrip_blocks option can also be set to strip tabs and spaces from the
		// beginning of a line to the start of a block. (Nothing will be stripped if
		// there are other characters before the start of the block.)
		template = template.replace(/^[ \t]*({[#%])/gm, "$1");
	}

	if (options.trim_blocks) {
		// If an application configures Jinja to trim_blocks, the first newline after
		// a template tag is removed automatically (like in PHP).
		template = template.replace(/([#%]})\n/g, "$1");
	}

	return template
		.replace(/{##}/g, "") // Remove comments
		.replace(/-%}\s*/g, "%}") // `-%}` also swallows the whitespace after the tag
		.replace(/\s*{%-/g, "{%") // `{%-` also swallows the whitespace before the tag
		.replace(/-}}\s*/g, "}}") // same for expressions
		.replace(/\s*{{-/g, "{{");
}
/**
 * Generate a list of tokens from a source string.
 *
 * The source is first run through `preprocess`. Everything outside `{% … %}` and
 * `{{ … }}` becomes `Text` tokens; inside those delimiters whitespace is skipped
 * and the content is split into operators, literals, keywords and identifiers.
 *
 * @param source The raw template text.
 * @param options Whitespace-control flags forwarded to `preprocess`.
 * @returns The tokens in source order.
 * @throws {SyntaxError} On an unknown escape sequence, on a character that starts
 * no known token, or when the input ends in the middle of a token.
 */
export function tokenize(source: string, options: PreprocessOptions = {}): Token[] {
	const tokens: Token[] = [];
	const src: string = preprocess(source, options);

	let cursorPosition = 0;

	// Number of `{` opened and not yet closed since the last `{{` / `{%`.
	// While it is positive, "}}" closes object literals rather than the expression.
	let curlyBracketDepth = 0;

	/**
	 * Consumes characters while `predicate` accepts the character at the cursor and
	 * returns them, with backslash escapes already resolved.
	 */
	const consumeWhile = (predicate: (char: string) => boolean): string => {
		let str = "";
		while (predicate(src[cursorPosition])) {
			if (src[cursorPosition] === "\\") {
				// Consume the backslash
				++cursorPosition;
				// Check for end of input
				if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input");

				// Add the escaped character
				const escaped = src[cursorPosition++];
				const unescaped = ESCAPE_CHARACTERS.get(escaped);
				if (unescaped === undefined) {
					throw new SyntaxError(`Unexpected escaped character: ${escaped}`);
				}
				str += unescaped;
			} else {
				str += src[cursorPosition++];
			}
			// A token may never run up to the very end of the input: something
			// (at least a closing delimiter) has to follow it.
			if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input");
		}
		return str;
	};

	// Build each token until end of input
	main: while (cursorPosition < src.length) {
		// First, consume all text that is outside of a Jinja statement or expression
		const lastTokenType = tokens.at(-1)?.type;
		if (
			lastTokenType === undefined ||
			lastTokenType === TOKEN_TYPES.CloseStatement ||
			lastTokenType === TOKEN_TYPES.CloseExpression
		) {
			let text = "";
			while (
				cursorPosition < src.length &&
				// Keep going until we hit the next Jinja statement or expression
				!(src[cursorPosition] === "{" && (src[cursorPosition + 1] === "%" || src[cursorPosition + 1] === "{"))
			) {
				// Consume text
				text += src[cursorPosition++];
			}

			// There is some text to add
			if (text.length > 0) {
				tokens.push(new Token(text, TOKEN_TYPES.Text));
				continue;
			}
		}

		// Consume (and ignore) all whitespace inside Jinja statements or expressions
		consumeWhile((c) => /\s/.test(c));

		// Handle multi-character tokens
		const char = src[cursorPosition];

		// Check for unary operators
		// (no token has been pushed since `lastTokenType` was read, so it is still current)
		if (char === "-" || char === "+") {
			if (lastTokenType === TOKEN_TYPES.Text || lastTokenType === undefined) {
				throw new SyntaxError(`Unexpected character: ${char}`);
			}
			switch (lastTokenType) {
				case TOKEN_TYPES.Identifier:
				case TOKEN_TYPES.NumericLiteral:
				case TOKEN_TYPES.BooleanLiteral:
				case TOKEN_TYPES.StringLiteral:
				case TOKEN_TYPES.CloseParen:
				case TOKEN_TYPES.CloseSquareBracket:
					// Part of a binary operator
					// a - 1, 1 - 1, true - 1, "apple" - 1, (1) - 1, a[1] - 1
					// Continue parsing normally
					break;

				default: {
					// Is part of a unary operator
					// (-1), [-1], (1 + -1), not -1, -apple
					++cursorPosition; // consume the unary operator

					// Check for numbers following the unary operator
					const num = consumeWhile(isInteger);
					tokens.push(
						new Token(`${char}${num}`, num.length > 0 ? TOKEN_TYPES.NumericLiteral : TOKEN_TYPES.UnaryOperator)
					);
					continue;
				}
			}
		}

		// Try to match one of the tokens in the mapping table
		for (const [seq, type] of ORDERED_MAPPING_TABLE) {
			// Inside an object literal, "}}" is two closing brackets, not the end of
			// the expression: skip this rule so the single "}" rule can match.
			if (seq === "}}" && curlyBracketDepth > 0) {
				continue;
			}

			const slice = src.slice(cursorPosition, cursorPosition + seq.length);
			if (slice === seq) {
				tokens.push(new Token(seq, type));

				// Keep the curly bracket depth in sync with what was just emitted
				if (type === TOKEN_TYPES.OpenExpression || type === TOKEN_TYPES.OpenStatement) {
					curlyBracketDepth = 0;
				} else if (type === TOKEN_TYPES.OpenCurlyBracket) {
					++curlyBracketDepth;
				} else if (type === TOKEN_TYPES.CloseCurlyBracket && curlyBracketDepth > 0) {
					--curlyBracketDepth;
				}

				cursorPosition += seq.length;
				continue main;
			}
		}

		if (char === "'" || char === '"') {
			++cursorPosition; // Skip the opening quote
			const str = consumeWhile((c) => c !== char);
			tokens.push(new Token(str, TOKEN_TYPES.StringLiteral));
			++cursorPosition; // Skip the closing quote
			continue;
		}

		if (isInteger(char)) {
			const num = consumeWhile(isInteger);
			tokens.push(new Token(num, TOKEN_TYPES.NumericLiteral));
			continue;
		}

		if (isWord(char)) {
			const word = consumeWhile(isWord);

			// Check for special/reserved keywords
			// NOTE: We use Object.hasOwn() to avoid matching `.toString()` and other Object methods
			const type = Object.hasOwn(KEYWORDS, word) ? KEYWORDS[word as keyof typeof KEYWORDS] : TOKEN_TYPES.Identifier;

			// Special case of not in:
			// If the previous token was a "not", and this token is "in"
			// then we want to combine them into a single token.
			// TOKEN_TYPES.Not is the same tag as UnaryOperator, so the value must be
			// checked too; otherwise a unary "-" or "+" followed by "in" would be merged.
			const previous = tokens.at(-1);
			if (type === TOKEN_TYPES.In && previous?.type === TOKEN_TYPES.Not && previous.value === "not") {
				tokens.pop();
				tokens.push(new Token("not in", TOKEN_TYPES.NotIn));
			} else {
				tokens.push(new Token(word, type));
			}

			continue;
		}

		throw new SyntaxError(`Unexpected character: ${char}`);
	}

	return tokens;
}