std.experimental.lexer 31/41(75%) line coverage

      
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300
1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500
1510
1520
1530
1540
1550
1560
1570
1580
1590
1600
1610
1620
1630
1640
1650
1660
1670
1680
1691844449
1700
1711844449
1721844449
1730
1740
1750
1760
1770
1780
1790
1800
1810
1820
1830
1840
1850
1860
1870
1880
1890
1900
1910
1920
1930
1940
1950
1960
1970
1980
1990
2000
2010
2020
2030
2040
2050
2060
2070
2080
2090
2100
2110
2120
2130
2140
2150
2160
2170
2180
2190
2200
2210
2220
2230
2240
2250
2260
2270
2280
2290
2300
2310
2320
2330
2340
2350
2360
2370
2380
2390
2400
2410
2420
2430
2440
2450
2460
2470
2480
2490
2500
2510
2520
2530
2540
2550
2560
2570
2580
2590
2600
2610
2620
2630
2640
2650
2660
2670
2680
2690
2700
271222
2720
2730
2740
2750
2760
2770
2780
2790
2800
2810
2820
2830
2840
2850
2860
2870
2880
2890
2900
2910
2920
293328099
2940
295328099
296328099
297328099
298328099
299328099
3000
3010
3020
3030
3040
3050
3060
3070
3080
3090
3100
3110
3120
3130
3140
3150
3160
3170
3180
3190
3200
3210
3220
3230
3240
3250
3260
3270
3280
3290
3300
3310
3320
3330
3340
3350
3360
3370
3380
3390
3400
3410
3420
3430
3440
3450
3460
3470
3480
3490
3500
3510
3520
3530
3540
3550
3560
3570
3580
3590
3600
3610
3620
3630
3640
3650
3660
3670
3680
3690
3700
3710
3720
3730
3740
3750
3760
3770
3780
3790
3800
3810
3820
3830
3840
3850
3860
3870
3880
3890
3900
3910
3920
3930
3940
3950
3960
3970
3980
3990
4000
4010
4020
4030
4040
4050
4060
4070
4080
4090
4100
4110
4120
4130
4140
4150
4160
4170
4180
4190
4200
4210
4220
4230
4240
4250
4260
4270
4280
4290
4300
4310
4320
4330
4340
4350
4360
4370
4380
4390
4400
4410
4420
4430
4440
4450
4460
4470
4480
4490
4500
4510
4520
4530
4540
4550
4560
4570
4580
4590
4600
4610
4620
4630
4640
4650
4660
4670
4680
4690
4700
4710
4720
4730
4740
4750
4760
4770
4780
4790
4800
4810
4820
4830
4840
4850
4860
4870
4880
4890
4900
4910
4920
4930
4940
4950
4960
4970
4980
4990
5000
5010
5020
5030
5040
5050
5060
5070
5080
5090
5100
5110
5120
5130
5140
5150
5160
5170
5180
5190
5200
5210
5220
5230
5240
5250
5260
5270
5280
5290
5300
5310
5320
5330
5340
5350
5360
5370
5380
5390
5400
5410
5420
5430
5440
5450
5460
5470
5480
5490
5500
5510
5520
5530
5540
5550
5560
5570
5580
5590
5600
5610
5620
5630
5640
5650
5660
5670
5680
5690
5700
5710
5720
5730
5740
5750
5760
5770
5780
5790
5800
5810
5820
5830
5840
5850
5860
5870
5880
5890
5900
5910
5920
5930
5940
5950
5960
5970
5980
5990
6000
6010
6020
6030
6040
6050
6060
6070
6080
6090
6100
6110
6120
6130
6140
6150
6160
6170
6180
6190
6200
6210
6220
6230
6240
6250
6260
6270
6280
6290
6300
6310
6320
6330
6340
6350
6360
6370
6380
6390
6400
6410
6420
6430
6440
6450
6460
6470
6480
6490
6500
6510
6520
6530
6540
6550
6560
6570
6580
6590
6600
6610
6620
6630
6640
6650
6660
6670
6680
6690
6700
6710
6720
6730
6740
6750
6760
6770
6780
6790
6800
6810
6820
6830
6840
6850
6860
6870
6880
6890
6900
6910
6920
6930
6940
6950
6960
6970
6980
6990
7000
7010
7020
7030
7040
7050
7060
7070
7080
7090
7100
7110
7120
7130
7140
7150
7160
7170
7180
7190
7200
7210
7220
7230
7240
7250
7260
7270
7280
7290
7300
7310
7320
7330
7340
7350
73638
7370
73838
73938
74038
74138
7420
7430
7440
7450
7460
7470
7480
74978548
7500
7510
7520
7530
7540
7550
7560
7570
7580
7590
7600
7610
7620
7630
7640
7650
7660
7670
76869195
7690
7700
7710
7720
7730
7740
7750
776117
7770
7780
7790
7800
7810
7820
7830
7840
7850
7860
7870
7880
7890
7900
7910
792258
7930
794258
7950
7960
7970
7980
7990
8000
8010
802144
8030
8044497
8051355
8060
807144
8080
8090
8100
8110
8120
8130
8140
8159422
8160
8170
8180
8190
8200
8210
8220
8230
8240
8250
8260
8270
8280
8290
8300
83141673
83241673
8330
8340
8350
8360
8370
8380
8390
8400
841819
842819
8430
8440
8450
8460
8470
8480
8490
850649
851649
8520
8530
8540
8550
8560
8570
8580
8590
8600
8610
8620
8630
8640
8650
8660
8670
8680
8690
8700
8710
8720
8730
// Written in the D programming language

/**
 * $(H2 Summary)
 * This module contains a range-based compile-time _lexer generator.
 *
 * $(H2 Overview)
 * The _lexer generator consists of a template mixin, $(LREF Lexer), along with
 * several helper templates for generating such things as token identifiers.
 *
 * To write a _lexer using this API:
 * $(OL
 *     $(LI Create the string array constants for your language.
 *         $(UL
 *             $(LI $(LINK2 #.staticTokens, staticTokens))
 *             $(LI $(LINK2 #.dynamicTokens, dynamicTokens))
 *             $(LI $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens))
 *             $(LI $(LINK2 #.tokenHandlers, tokenHandlers))
 *         ))
 *     $(LI Create aliases for the various token and token identifier types
 *         specific to your language.
 *         $(UL
 *             $(LI $(LREF TokenIdType))
 *             $(LI $(LREF tokenStringRepresentation))
 *             $(LI $(LREF TokenStructure))
 *             $(LI $(LREF TokenId))
 *         ))
 *     $(LI Create a struct that mixes in the Lexer template mixin and
 *         implements the necessary functions.
 *         $(UL
 *             $(LI $(LREF Lexer))
 *         ))
 * )
 * Examples:
 * $(UL
 *     $(LI A _lexer for D is available $(LINK2 https://github.com/Hackerpilot/Dscanner/blob/master/std/d/lexer.d, here).)
 *     $(LI A _lexer for Lua is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/lualexer.d, here).)
 *     $(LI A _lexer for JSON is available $(LINK2 https://github.com/Hackerpilot/lexer-demo/blob/master/jsonlexer.d, here).)
 * )
 * $(DDOC_ANCHOR TemplateParameters) $(H2 Template Parameter Definitions)
 * $(DL
 * $(DT $(DDOC_ANCHOR defaultTokenFunction) $(B defaultTokenFunction))
 * $(DD A function that serves as the default token lexing function. For most
 *     languages this will be the identifier lexing function.)
 * $(DT $(DDOC_ANCHOR tokenSeparatingFunction) $(B tokenSeparatingFunction))
 * $(DD A function that is able to determine if an identifier/keyword has come
 *     to an end. This function must return bool and take a single size_t
 *     argument representing the number of bytes to skip over before looking for
 *     a separating character.)
* $(DT $(DDOC_ANCHOR staticTokens) $(B staticTokens))
 * $(DD A listing of the tokens whose exact value never changes and which cannot
 *     possibly be a token handled by the default token lexing function. The
 *     most common example of this kind of token is an operator such as
 *     $(D_STRING "*"), or $(D_STRING "-") in a programming language.)
 * $(DT $(DDOC_ANCHOR dynamicTokens) $(B dynamicTokens))
 * $(DD A listing of tokens whose value is variable, such as whitespace,
 *     identifiers, number literals, and string literals.)
 * $(DT $(DDOC_ANCHOR possibleDefaultTokens) $(B possibleDefaultTokens))
 * $(DD A listing of tokens that could possibly be one of the tokens handled by
 *     the default token handling function. A common example of this is
 *     a keyword such as $(D_STRING "for"), which looks like the beginning of
 *     the identifier $(D_STRING "fortunate"). $(B tokenSeparatingFunction) is
 *     called to determine if the character after the $(D_STRING 'r') separates
 *     the identifier, indicating that the token is $(D_STRING "for"), or if
 *     lexing should be turned over to the $(B defaultTokenFunction).)
 * $(DT $(DDOC_ANCHOR tokenHandlers) $(B tokenHandlers))
 * $(DD A mapping of prefixes to custom token handling function names. The
 *     generated _lexer will search for the even-index elements of this array,
 *     and then call the function whose name is the element immediately after the
 *     even-indexed element. This is used for lexing complex tokens whose prefix
 *     is fixed.)
 * )
 *
 * Here are some example constants for a simple calculator _lexer:
 * ---
 * // There are a near infinite number of valid number literals, so numbers are
 * // dynamic tokens.
 * enum string[] dynamicTokens = ["numberLiteral", "whitespace"];
 *
 * // The operators are always the same, and cannot start a numberLiteral, so
 * // they are staticTokens
 * enum string[] staticTokens = ["-", "+", "*", "/"];
 *
 * // In this simple example there are no keywords or other tokens that could
 * // look like dynamic tokens, so this is blank.
* enum string[] possibleDefaultTokens = [];
 *
 * // If any whitespace character or digit is encountered, pass lexing over to
 * // our custom handler functions. These will be demonstrated in an example
 * // later on.
 * enum string[] tokenHandlers = [
 *     "0", "lexNumber",
 *     "1", "lexNumber",
 *     "2", "lexNumber",
 *     "3", "lexNumber",
 *     "4", "lexNumber",
 *     "5", "lexNumber",
 *     "6", "lexNumber",
 *     "7", "lexNumber",
 *     "8", "lexNumber",
 *     "9", "lexNumber",
 *     " ", "lexWhitespace",
 *     "\n", "lexWhitespace",
 *     "\t", "lexWhitespace",
 *     "\r", "lexWhitespace"
 * ];
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
 * Authors: Brian Schott, with ideas shamelessly stolen from Andrei Alexandrescu
 * Source: $(PHOBOSSRC std/experimental/_lexer.d)
 */

module std.experimental.lexer;

/**
 * Chooses the integral type used to store token type identifiers.
 *
 * The template aliases itself to the narrowest of $(D_KEYWORD ubyte),
 * $(D_KEYWORD ushort) and $(D_KEYWORD uint) whose $(D .max) is at least
 * staticTokens.length + dynamicTokens.length + possibleDefaultTokens.length + 1.
 * With 20 static tokens, 30 dynamic tokens and 10 possible default tokens the
 * required count is 61, which fits in $(D_KEYWORD ubyte).
 * Compilation fails if the count does not fit in $(D_KEYWORD uint).
 * Examples:
 * ---
 * // In our calculator example this means that IdType is an alias for ubyte.
 * alias IdType = TokenIdType!(staticTokens, dynamicTokens, possibleDefaultTokens);
 * ---
 */
template TokenIdType(alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)
{
    // Number of identifiers that must be representable: every listed token
    // plus one extra value.
    enum size_t idCount = staticTokens.length + dynamicTokens.length
        + possibleDefaultTokens.length + 1;

    // Walk from the widest candidate down to the narrowest; the first range
    // that idCount exceeds decides the result.
    static if (idCount > uint.max)
        static assert (false, "The number of tokens must be less than uint.max");
    else static if (idCount > ushort.max)
        alias TokenIdType = uint;
    else static if (idCount > ubyte.max)
        alias TokenIdType = ushort;
    else
        alias TokenIdType = ubyte;
}

/**
 * Looks up the string representation of the given token type.
*
 * This is the inverse of the TokenId template: it maps an identifier back to
 * the token text it was generated from.
 * Params: type = the token type identifier
 * Returns: $(D_STRING "!ERROR!") for identifier 0, the matching entry of
 *     staticTokens ~ dynamicTokens ~ possibleDefaultTokens for identifiers
 *     1 through the number of tokens, and $(D_KEYWORD null) for anything larger.
 * Examples:
 * ---
 * alias str = tokenStringRepresentation!(IdType, staticTokens, dynamicTokens, possibleDefaultTokens);
 * assert (str(tok!"*") == "*");
 * ---
 * See_also: $(LREF TokenId)
 */
string tokenStringRepresentation(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens)(IdType type) pure nothrow @property @nogc @safe
{
    // hax
    // The concatenation is done in a @trusted helper and evaluated once into
    // a static immutable table.
    static auto buildTable() pure nothrow @trusted
    {
        return cast(immutable) staticTokens ~ dynamicTokens ~ possibleDefaultTokens;
    }

    static immutable table = buildTable();

    // Identifier 0 is the error token.
    if (type == 0)
        return "!ERROR!";

    // Identifiers past the end of the table have no textual form.
    if (type > table.length)
        return null;

    // Identifiers are one-based indexes into the table.
    return table[type - 1];
}

unittest
{
    alias IdType = TokenIdType!(["foo"], ["bar"], ["doo"]);
    enum tok(string token) = TokenId!(IdType, ["foo"], ["bar"], ["doo"], token);
    alias str = tokenStringRepresentation!(IdType, ["foo"], ["bar"], ["doo"]);

    static assert (str(tok!"foo") == "foo");
    static assert (str(tok!"bar") == "bar");
    static assert (str(tok!"doo") == "doo");
}

/**
 * Generates the token type identifier for the given symbol.
 *
 * There are two special cases:
 * $(UL
 *     $(LI If symbol is $(D_STRING ""), then the token identifier will be 0)
 *     $(LI If symbol is $(D_STRING "\0"), then the token identifier will be the maximum
 *         valid token type identifier)
 * )
 * In all cases this template will alias itself to a constant of type IdType.
 * This template will fail at compile time if $(D_PARAM symbol) is not one of
 * the staticTokens, dynamicTokens, or possibleDefaultTokens.
 * Examples:
 * ---
 * template tok(string symbol)
 * {
 *     alias tok = TokenId!(IdType, staticTokens, dynamicTokens,
 *         possibleDefaultTokens, symbol);
 * }
 * // num and plus are of type ubyte.
* IdType plus = tok!"+";
 * IdType num = tok!"numberLiteral";
 * ---
 */
template TokenId(IdType, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, string symbol)
{
    import std.algorithm;

    // Identifiers are one-based positions in this combined list.
    enum allSymbols = staticTokens ~ dynamicTokens ~ possibleDefaultTokens;

    static if (symbol == "\0")
    {
        // One past the last listed token.
        enum id = 1 + allSymbols.length;
        alias TokenId = id;
    }
    else static if (symbol == "")
    {
        // The empty symbol always maps to identifier 0.
        enum id = 0;
        alias TokenId = id;
    }
    else
    {
        enum position = allSymbols.countUntil(symbol);
        static if (position == -1)
            static assert (0, "Invalid token: " ~ symbol);
        else
        {
            enum id = position + 1;
            static assert (id >= 0 && id < IdType.max, "Invalid token: " ~ symbol);
            alias TokenId = id;
        }
    }
}

/**
 * The token type produced by the lexer.
 * Params:
 *     IdType = The D type of the token's $(D type) field.
 *     extraFields = D source code for any additional members of the token
 *         structure. The string is handed unchanged to a mixin inside the
 *         struct body.
 * Examples:
 * ---
 * // No extra struct fields are desired in this example, so leave it blank.
 * alias Token = TokenStructure!(IdType, "");
 * Token minusToken = Token(tok!"-");
 * ---
 */
struct TokenStructure(IdType, string extraFields = "")
{
public pure nothrow @safe @nogc
{
    /**
     * Returns: true if both tokens have the same type and the same text.
     * Line, column and index do not take part in the comparison.
     */
    bool opEquals(ref const typeof(this) other) const
    {
        if (this.type != other.type)
            return false;
        return this.text == other.text;
    }

    /**
     * Returns: true if the token has the given type, false otherwise.
     */
    bool opEquals(IdType type) const
    {
        return type == this.type;
    }

    /**
     * Constructs a token that carries only a type.
     * Params: type = the token type
     */
    this(IdType type)
    {
        this.type = type;
    }

    /**
     * Constructs a fully populated token.
     * Params:
     *     type = the token type
     *     text = the text of the token, which may be null
     *     line = the line number at which this token occurs
     *     column = the column number at which this token occurs
     *     index = the byte offset from the beginning of the input at which this
     *         token occurs
     */
    this(IdType type, string text, size_t line, size_t column, size_t index)
    {
        this.type = type;
        this.text = text;
        this.index = index;
        this.line = line;
        this.column = column;
    }

    /**
     * The _text of the token.
     */
    string text;

    /**
     * The _line number at which this token occurs.
     */
    size_t line;

    /**
     * The _column number at which this token occurs. This is measured in bytes
     * and may not be correct when tab characters are involved.
     */
    size_t column;

    /**
     * The byte offset from the beginning of the input at which this token
     * occurs.
     */
    size_t index;

    /**
     * The token type.
     */
    IdType type;
}

    mixin (extraFields);
}

/**
 * The implementation of the _lexer is contained within this mixin template.
 *
 * To use it, this template should be mixed in to a struct that represents the
 * _lexer for your language. This struct should implement the following methods:
 * $(UL
 *     $(LI popFront, which should call this mixin's _popFront() and
 *         additionally perform any token filtering or shuffling you deem
 *         necessary. For example, you can implement popFront to skip comment or
 *         whitespace tokens.)
 *     $(LI A function that serves as the default token lexing function. For
 *         most languages this will be the identifier lexing function. This
 *         should then be passed to the $(LREF Lexer) template mixin as the
 *         $(LINK2 #.defaultTokenFunction, defaultTokenFunction) template
 *         parameter.)
 *     $(LI A function that is able to determine if an identifier/keyword has
 *         come to an end. This function must return $(D_KEYWORD bool) and take
 *         a single $(D_KEYWORD size_t) argument representing the number of
 *         bytes to skip over before looking for a separating character.)
 *     $(LI Any functions referred to in the tokenHandlers template parameter.
* These functions must be marked $(D_KEYWORD pure nothrow). The generated
 *         code calls them as $(D handler(token)), so they receive the token to
 *         fill in as their only argument.)
 *     $(LI A constructor that initializes the range field as well as calls
 *         popFront() exactly once (to initialize the _front field).)
 * )
 * Params:
 *     Token = $(LREF TokenStructure)
 *     defaultTokenFunction = $(LINK2 #.defaultTokenFunction, defaultTokenFunction)
 *     tokenSeparatingFunction = $(LINK2 #.tokenSeparatingFunction, tokenSeparatingFunction)
 *     staticTokens = $(LINK2 #.staticTokens, staticTokens)
 *     dynamicTokens = $(LINK2 #.dynamicTokens, dynamicTokens)
 *     possibleDefaultTokens = $(LINK2 #.possibleDefaultTokens, possibleDefaultTokens)
 *     tokenHandlers = $(LINK2 #.tokenHandlers, tokenHandlers)
 * Examples:
 * ---
 * struct CalculatorLexer
 * {
 *     mixin Lexer!(Token, defaultTokenFunction, isSeparating,
 *         staticTokens, dynamicTokens, possibleDefaultTokens, tokenHandlers);
 *
 *     this (ubyte[] bytes)
 *     {
 *         this.range = LexerRange(bytes);
 *         popFront();
 *     }
 *
 *     void popFront() pure
 *     {
 *         _popFront();
 *     }
 *
 *     void lexNumber(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void lexWhitespace(ref Token token) pure nothrow @safe
 *     {
 *         // implementation goes here
 *     }
 *
 *     void defaultTokenFunction(ref Token token) pure nothrow @safe
 *     {
 *         // There is no default token in the example calculator language, so
 *         // this is always an error.
 *         range.popFront();
 *         token = Token(tok!"");
 *     }
 *
 *     bool isSeparating(size_t offset) pure nothrow @safe
 *     {
 *         // For this example language, always return true.
 *         return true;
 *     }
 * }
 * ---
 */
mixin template Lexer(Token, alias defaultTokenFunction,
    alias tokenSeparatingFunction, alias staticTokens, alias dynamicTokens,
    alias possibleDefaultTokens, alias tokenHandlers)
{
    private alias _IDType = typeof(Token.type);
    private enum _tok(string symbol) = TokenId!(_IDType, staticTokens, dynamicTokens, possibleDefaultTokens, symbol);

    static assert (tokenHandlers.length % 2 == 0, "Each pseudo-token must"
        ~ " have a corresponding handler function name.");

    /**
     * Packs at most the first eight bytes of arr into a ulong, with arr[0] in
     * the lowest byte, and returns it as a 16-digit hexadecimal literal.
     */
    static string generateMask(const ubyte[] arr)
    {
        import std.string : format;
        ulong u;
        for (size_t i = 0; i < arr.length && i < 8; i++)
        {
            u |= (cast(ulong) arr[i]) << (i * 8);
        }
        return format("0x%016x", u);
    }

    /**
     * Returns a hexadecimal literal whose lowest l bytes are all ones.
     * Within this template it is only called for token lengths 2 through 7.
     */
    private static string generateByteMask(size_t l)
    {
        import std.string : format;
        return format("0x%016x", ulong.max >> ((8 - l) * 8));
    }

    /**
     * Counts how many times a has to be halved before it drops below b.
     * The result is always at least 1.
     */
    private static size_t calcSplitCount(size_t a, size_t b) pure nothrow
    {
        int i;
        while (true)
        {
            i++;
            a /= 2;
            if (a < b)
                break;
        }
        return i;
    }

    /**
     * Collects the first character of every run of adjacent tokens that share
     * a first character. Empty tokens are skipped. The caller passes a sorted
     * list, so each first character appears once in the result.
     */
    private static char[] getBeginningChars(string[] allTokens)
    {
        char[] beginningChars;
        for (size_t i = 0; i < allTokens.length; i++)
        {
            if (allTokens[i].length == 0)
                continue;
            beginningChars ~= allTokens[i][0];
            size_t j = i + 1;
            while (j < allTokens.length && allTokens[i][0] == allTokens[j][0])
                j++;
            i = j - 1;
        }
        return beginningChars;
    }

    /**
     * Builds the D source text that advance() mixes in to pick the handling
     * code for the token at the front of the input.
     */
    private static string generateStatements()
    {
        // array and uniq are used below and must be imported explicitly.
        import std.algorithm : sort, uniq;
        import std.array : array;
        import std.range : stride;

        // The even-indexed elements of tokenHandlers are the prefixes.
        string[] pseudoTokens = array(tokenHandlers.stride(2));
        string[] allTokens = array(sort(staticTokens ~ possibleDefaultTokens ~ pseudoTokens).uniq());
        // Array consisting of a sorted list of the first characters of the
        // tokens.
        char[] beginningChars = getBeginningChars(allTokens);
        size_t i = calcSplitCount(beginningChars.length, 8);
        return generateStatementsStep(allTokens, pseudoTokens, beginningChars, i);
    }

    /**
     * Emits a binary search over the first byte f: i levels of
     * "if (f < pivot)" splits followed by a switch over the remaining
     * characters.
     * Params:
     *     allTokens = sorted, de-duplicated static, possible-default and pseudo tokens
     *     pseudoTokens = the prefixes taken from tokenHandlers
     *     chars = the first characters still to be distinguished at this level
     *     i = the number of if/else levels still to emit
     *     indent = the whitespace prefix for the generated lines
     */
    private static string generateStatementsStep(string[] allTokens,
        string[] pseudoTokens, char[] chars, size_t i, string indent = "")
    {
        import std.string : format;
        string code;
        // With no characters there is no pivot to index, so go straight to a
        // switch that only has the default label.
        if (i > 0 && chars.length > 0)
        {
            size_t p = chars.length / 2;
            code ~= indent ~ format("if (f < 0x%02x) // %s \n%s{\n", chars[p], chars[p], indent);
            code ~= generateStatementsStep(allTokens, pseudoTokens, chars[0 .. p], i - 1, indent ~ " ");
            code ~= indent ~ "}\n" ~ indent ~ "else\n" ~ indent ~ "{\n";
            code ~= generateStatementsStep(allTokens, pseudoTokens, chars[p .. $], i - 1, indent ~ " ");
            code ~= indent ~ "}\n";
        }
        else
        {
            code ~= indent ~ "switch (f)\n" ~ indent ~ "{\n";
            foreach (char c; chars)
            {
                // Locate the run [begin, end) of tokens that start with c.
                size_t begin;
                size_t end;
                for (size_t j = 0; j < allTokens.length; j++)
                {
                    if (allTokens[j].length == 0 || allTokens[j][0] != c)
                        continue;
                    begin = j;
                    end = j + 1;
                    while (end < allTokens.length && allTokens[begin][0] == allTokens[end][0])
                        end++;
                    break;
                }
                code ~= format("%scase 0x%02x:\n", indent, c);
                code ~= printCase(allTokens[begin .. end], pseudoTokens, indent ~ " ");
            }
            code ~= indent ~ "default: goto _defaultTokenFunction;\n";
            code ~= indent ~ "}\n";
        }
        return code;
    }

    /**
     * Emits the body of one switch case.
     * Params:
     *     tokens = all tokens that start with the case's character
     *     pseudoTokens = the prefixes taken from tokenHandlers
     *     indent = the whitespace prefix for the generated lines
     */
    private static string printCase(string[] tokens, string[] pseudoTokens, string indent)
    {
        import std.array : array;
        // sort is used below and must be imported explicitly.
        import std.algorithm : countUntil, sort;
        import std.conv : text;

        // Longest tokens are tested first so a short token cannot shadow a
        // longer one with the same prefix.
        string[] sortedTokens = array(sort!"a.length > b.length"(tokens));

        // A lone one-byte token needs no comparison beyond the case label.
        if (tokens.length == 1 && tokens[0].length == 1)
        {
            if (pseudoTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ tokenHandlers[tokenHandlers.countUntil(tokens[0]) + 1]
                    ~ "(token);\n" ~ indent ~ "return;\n";
            }
            else if (staticTokens.countUntil(tokens[0]) >= 0)
            {
                return indent ~ "range.index++; range.column++;\n"
                    ~ indent ~ "token = Token(_tok!\"" ~ escape(tokens[0]) ~ "\", null, line, column, index);\n"
                    ~ indent ~ "return;\n";
            }
        }

        string code;
        bool insertTrailingGoto = true;
        foreach (token; sortedTokens)
        {
            immutable mask = generateMask(cast (const ubyte[]) token);
            if (token.length >= 8)
                code ~= indent ~ "if (frontBytes == " ~ mask ~ ")\n";
            else if (token.length != 1)
                code ~= indent ~ "if ((frontBytes & " ~ generateByteMask(token.length) ~ ") == " ~ mask ~ ")\n";
            if (token.length != 1)
                code ~= indent ~ "{\n";
            if (pseudoTokens.countUntil(token) >= 0)
            {
                if (token.length <= 8)
                {
                    if (token.length == 1)
                        insertTrailingGoto = false;
                    code ~= (token.length == 1 ? indent : indent ~ " ")
                        ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
                        ~ "(token);\n";
                    code ~= (token.length == 1 ? indent : indent ~ " ") ~ "return;\n";
                }
                else
                {
                    // Only the first eight bytes were compared above, so the
                    // whole prefix is checked before the handler is called.
                    // The handler call and the return are both inside the
                    // check; on a mismatch control falls through to the next
                    // candidate.
                    code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~ "\"))\n";
                    code ~= indent ~ " {\n";
                    code ~= indent ~ " "
                        ~ tokenHandlers[tokenHandlers.countUntil(token) + 1]
                        ~ "(token);\n";
                    code ~= indent ~ " return;\n";
                    code ~= indent ~ " }\n";
                }
            }
            else if (staticTokens.countUntil(token) >= 0)
            {
                if (token.length <= 8)
                {
                    if (token.length == 1)
                        insertTrailingGoto = false;
                    code ~= indent ~ (token.length != 1 ? " " : "") ~ "range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ (token.length != 1 ? " " : "") ~ "token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ (token.length != 1 ? " " : "") ~ "return;\n";
                }
                else
                {
                    code ~= indent ~ " pragma(msg, \"long static tokens not supported\"); // " ~ escape(token) ~ "\n";
                }
            }
            else
            {
                // possible default
                if (token.length <= 8)
                {
                    code ~= indent ~ " if (tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
                    code ~= indent ~ " {\n";
                    code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ " return;\n";
                    code ~= indent ~ " }\n";
                    code ~= indent ~ " else\n";
                    code ~= indent ~ " goto _defaultTokenFunction;\n";
                }
                else
                {
                    // Uses the tokenSeparatingFunction alias, as the short
                    // branch does, rather than a hard-coded function name.
                    code ~= indent ~ " if (range.startsWith(cast (ubyte[]) \"" ~ escape(token) ~"\") && tokenSeparatingFunction(" ~ text(token.length) ~ "))\n";
                    code ~= indent ~ " {\n";
                    code ~= indent ~ " range.index += " ~ text(token.length) ~ "; range.column += " ~ text(token.length) ~ ";\n";
                    code ~= indent ~ " token = Token(_tok!\"" ~ escape(token) ~ "\", null, line, column, index);\n";
                    code ~= indent ~ " return;\n";
                    code ~= indent ~ " }\n";
                    code ~= indent ~ " else\n";
                    code ~= indent ~ " goto _defaultTokenFunction;\n";
                }
            }
            if (token.length != 1)
            {
                code ~= indent ~ "}\n";
            }
        }
        // Unless a one-byte static or pseudo token ended the case with a
        // return, fall back to the default token function.
        if (insertTrailingGoto)
            code ~= indent ~ "goto _defaultTokenFunction;\n";
        return code;
    }

    /**
     * Implements the range primitive _front.
     */
    ref const(Token) front()() pure nothrow const @property @safe
    {
        return _front;
    }

    /**
     * Advances the lexer to the next token and stores the new current token in
     * the _front variable.
     */
    void _popFront()() pure nothrow @safe
    {
        advance(_front);
    }

    /**
     * Implements the range primitive _empty.
     * Returns: true once the front token has the type _tok!"\0", which
     * advance() assigns when the input is exhausted.
     */
    bool empty()() pure const nothrow @property @safe @nogc
    {
        return _front.type == _tok!"\0";
    }

    /**
     * Escapes backslash, double quote, single quote, tab, newline and carriage
     * return so that input can be placed inside a string literal in the
     * generated code. All other bytes are copied unchanged.
     */
    static string escape(string input) pure @trusted
    {
        string retVal;
        foreach (ubyte c; cast(ubyte[]) input)
        {
            switch (c)
            {
            case '\\': retVal ~= `\\`; break;
            case '"': retVal ~= `\"`; break;
            case '\'': retVal ~= `\'`; break;
            case '\t': retVal ~= `\t`; break;
            case '\n': retVal ~= `\n`; break;
            case '\r': retVal ~= `\r`; break;
            default: retVal ~= c; break;
            }
        }
        return retVal;
    }

    /// The generated token-matching source that advance() mixes in.
    enum tokenSearch = generateStatements();

    /**
     * Overlays the bytes of arr on a ulong whose remaining bytes are 0xff.
     * arr must not be longer than eight bytes.
     * NOTE(review): the overlay uses host byte order while generateMask puts
     * arr[0] in the lowest byte, so the two look like they agree only on
     * little-endian targets; confirm.
     */
    static ulong getFront(const ubyte[] arr) pure nothrow @trusted
    {
        static union ByteArr { ulong l; ubyte[8] arr; }
        static assert(ByteArr.sizeof == ulong.sizeof);
        ByteArr b;
        b.l = ulong.max;
        b.arr[0 .. arr.length] = arr[];
        return b.l;
    }

    /**
     * Lexes one token from range into token.
     *
     * At end of input only token.type is set, to _tok!"\0". Otherwise the
     * generated tokenSearch code either fills in the token and returns, or
     * jumps to the _defaultTokenFunction label.
     */
    void advance(ref Token token) pure nothrow @trusted
    {
        if (range.index >= range.bytes.length)
        {
            token.type = _tok!"\0";
            return;
        }
        // These locals are referenced by name from the generated code.
        immutable size_t index = range.index;
        immutable size_t column = range.column;
        immutable size_t line = range.line;
        // Up to eight bytes of lookahead; fewer near the end of the input.
        immutable ulong frontBytes = range.index + 8 <= range.bytes.length
            ? getFront(range.bytes[range.index .. range.index + 8])
            : getFront(range.bytes[range.index .. $]);
        ubyte f = cast(ubyte) frontBytes;
//        pragma(msg, tokenSearch);
        mixin(tokenSearch);
    _defaultTokenFunction:
        defaultTokenFunction(token);
    }

    /**
     * The lexer input.
     */
    LexerRange range;

    /**
     * The token that is currently at the front of the range.
     */
    Token _front;
}

/**
 * Range structure that wraps the _lexer's input.
 */
struct LexerRange
{
    // TODO: When D gets @forceinline the template inline hack (i.e
    // `void front()() { ... }` )should be removed.

public nothrow pure @safe @nogc:
    /**
     * Params:
     *     bytes = the _lexer input
     *     index = the initial offset from the beginning of $(D_PARAM bytes)
     *     column = the initial _column number
     *     line = the initial _line number
     */
    this(const(ubyte)[] bytes, size_t index = 0, size_t column = 1, size_t line = 1)
    {
        this.bytes = bytes;
        this.index = index;
        this.column = column;
        this.line = line;
    }

    /**
     * Returns: a mark at the current position that can then be used with slice.
     */
    size_t mark()() const
    {
        return index;
    }

    /**
     * Sets the range to the given position. Line and column are not changed.
     * Params: m = the position to seek to
     */
    void seek()(size_t m)
    {
        index = m;
    }

    /**
     * Returns a slice of the input byte array between the given mark and the
     * current position.
     * Params: m = the beginning index of the slice to return
     */
    const(ubyte)[] slice()(size_t m) const
    {
        return bytes[m .. index];
    }

    /**
     * Implements the range primitive _empty.
     */
    bool empty()() const
    {
        return index >= bytes.length;
    }

    /**
     * Implements the range primitive _front.
     */
    ubyte front()() const
    {
        return bytes[index];
    }

    /**
     * Returns: the current item as well as the items $(D_PARAM p) items ahead,
     * or everything from the current position to the end of the input when
     * fewer than p + 1 bytes remain.
     */
    const(ubyte)[] peek(size_t p) const
    {
        return index + p + 1 > bytes.length
            ? bytes[index .. $]
            : bytes[index .. index + p + 1];
    }

    /**
     * Returns: true if the range starts with the given byte sequence
     */
    bool startsWith(const(ubyte[]) needle) const
    {
        if (needle.length + index > bytes.length)
            return false;
        foreach (i; 0 .. needle.length)
            if (needle[i] != bytes[index + i])
                return false;
        return true;
    }

    /**
     * Returns: the byte located $(D_PARAM offset) bytes after the current
     * position.
     */
    ubyte peekAt()(size_t offset) const
    {
        return bytes[index + offset];
    }

    /**
     * Returns: true if it is possible to peek $(D_PARAM p) bytes ahead.
     */
    bool canPeek()(size_t p) const
    {
        return index + p < bytes.length;
    }

    /**
     * Implements the range primitive _popFront.
     */
    void popFront()()
    {
        index++;
        column++;
    }

    /**
     * Implements the algorithm _popFrontN more efficiently. This function does
     * not detect or handle newlines.
     */
    void popFrontN()(size_t n)
    {
        index += n;
        column += n;
    }

    /**
     * Increments the range's line number and resets the column counter.
     */
    void incrementLine()(size_t i = 1)
    {
        column = 1;
        line += i;
    }

    /**
     * The input _bytes.
     */
    const(ubyte)[] bytes;

    /**
     * The range's current position.
     */
    size_t index;

    /**
     * The current _column number.
     */
    size_t column;

    /**
     * The current _line number.
     */
    size_t line;
}