2025-07-10 19:10:17 +00:00
|
|
|
#include "tree_sitter/alloc.h"
|
|
|
|
#include "tree_sitter/parser.h"
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
#include <assert.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <wctype.h>
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
/// External token kinds produced by this scanner. The numeric values are used
/// as indices into `valid_symbols` and assigned to `lexer->result_symbol`.
enum TokenType {
    RAW_STRING_DELIMITER = 0,
    RAW_STRING_CONTENT = 1,
};
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
/// Maximum number of characters in a raw string delimiter.
/// The spec limits delimiters to 16 chars, so this also sizes the buffer
/// that stores the delimiter inside the scanner.
#define MAX_DELIMITER_LENGTH 16
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
typedef struct {
|
|
|
|
uint8_t delimiter_length;
|
|
|
|
wchar_t delimiter[MAX_DELIMITER_LENGTH];
|
|
|
|
} Scanner;
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
/// Consume the current lookahead character and include it in the token
/// (the second argument to the lexer callback is `false`).
static inline void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
}
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
/// Forget any recorded delimiter: zero the character buffer and set the
/// stored length back to 0.
static inline void reset(Scanner *scanner) {
    memset(scanner->delimiter, 0, sizeof(scanner->delimiter));
    scanner->delimiter_length = 0;
}
|
|
|
|
|
|
|
|
/// Scan the raw string delimiter in R"delimiter(content)delimiter"
|
|
|
|
static bool scan_raw_string_delimiter(Scanner *scanner, TSLexer *lexer) {
|
|
|
|
if (scanner->delimiter_length > 0) {
|
|
|
|
// Closing delimiter: must exactly match the opening delimiter.
|
|
|
|
// We already checked this when scanning content, but this is how we
|
|
|
|
// know when to stop. We can't stop at ", because R"""hello""" is valid.
|
|
|
|
for (int i = 0; i < scanner->delimiter_length; ++i) {
|
|
|
|
if (lexer->lookahead != scanner->delimiter[i]) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
advance(lexer);
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
2025-07-10 19:10:17 +00:00
|
|
|
reset(scanner);
|
|
|
|
return true;
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
// Opening delimiter: record the d-char-sequence up to (.
|
|
|
|
// d-char is any basic character except parens, backslashes, and spaces.
|
2025-07-08 03:26:10 +00:00
|
|
|
for (;;) {
|
2025-07-10 19:10:17 +00:00
|
|
|
if (scanner->delimiter_length >= MAX_DELIMITER_LENGTH || lexer->eof(lexer) || lexer->lookahead == '\\' ||
|
|
|
|
iswspace(lexer->lookahead)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (lexer->lookahead == '(') {
|
|
|
|
// Rather than create a token for an empty delimiter, we fail and
|
|
|
|
// let the grammar fall back to a delimiter-less rule.
|
|
|
|
return scanner->delimiter_length > 0;
|
|
|
|
}
|
|
|
|
scanner->delimiter[scanner->delimiter_length++] = lexer->lookahead;
|
|
|
|
advance(lexer);
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
2025-07-10 19:10:17 +00:00
|
|
|
}
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
/// Scan the raw string content in R"delimiter(content)delimiter"
|
|
|
|
static bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
|
|
|
|
// The progress made through the delimiter since the last ')'.
|
|
|
|
// The delimiter may not contain ')' so a single counter suffices.
|
|
|
|
for (int delimiter_index = -1;;) {
|
|
|
|
// If we hit EOF, consider the content to terminate there.
|
|
|
|
// This forms an incomplete raw_string_literal, and models the code
|
|
|
|
// well.
|
|
|
|
if (lexer->eof(lexer)) {
|
|
|
|
lexer->mark_end(lexer);
|
2025-07-08 03:26:10 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
if (delimiter_index >= 0) {
|
|
|
|
if (delimiter_index == scanner->delimiter_length) {
|
|
|
|
if (lexer->lookahead == '"') {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
delimiter_index = -1;
|
|
|
|
} else {
|
|
|
|
if (lexer->lookahead == scanner->delimiter[delimiter_index]) {
|
|
|
|
delimiter_index += 1;
|
|
|
|
} else {
|
|
|
|
delimiter_index = -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
if (delimiter_index == -1 && lexer->lookahead == ')') {
|
|
|
|
// The content doesn't include the )delimiter" part.
|
|
|
|
// We must still scan through it, but exclude it from the token.
|
|
|
|
lexer->mark_end(lexer);
|
|
|
|
delimiter_index = 0;
|
|
|
|
}
|
2025-07-08 03:26:10 +00:00
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
advance(lexer);
|
|
|
|
}
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
|
|
|
void *tree_sitter_cpp_external_scanner_create() {
|
2025-07-10 19:10:17 +00:00
|
|
|
Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
|
|
|
|
memset(scanner, 0, sizeof(Scanner));
|
|
|
|
return scanner;
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
2025-07-10 19:10:17 +00:00
|
|
|
bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
|
|
|
|
Scanner *scanner = (Scanner *)payload;
|
|
|
|
|
|
|
|
if (valid_symbols[RAW_STRING_DELIMITER] && valid_symbols[RAW_STRING_CONTENT]) {
|
|
|
|
// we're in error recovery
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// No skipping leading whitespace: raw-string grammar is space-sensitive.
|
|
|
|
if (valid_symbols[RAW_STRING_DELIMITER]) {
|
|
|
|
lexer->result_symbol = RAW_STRING_DELIMITER;
|
|
|
|
return scan_raw_string_delimiter(scanner, lexer);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (valid_symbols[RAW_STRING_CONTENT]) {
|
|
|
|
lexer->result_symbol = RAW_STRING_CONTENT;
|
|
|
|
return scan_raw_string_content(scanner, lexer);
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
unsigned tree_sitter_cpp_external_scanner_serialize(void *payload, char *buffer) {
|
2025-07-10 19:10:17 +00:00
|
|
|
static_assert(MAX_DELIMITER_LENGTH * sizeof(wchar_t) < TREE_SITTER_SERIALIZATION_BUFFER_SIZE,
|
|
|
|
"Serialized delimiter is too long!");
|
|
|
|
|
|
|
|
Scanner *scanner = (Scanner *)payload;
|
|
|
|
size_t size = scanner->delimiter_length * sizeof(wchar_t);
|
|
|
|
memcpy(buffer, scanner->delimiter, size);
|
|
|
|
return (unsigned)size;
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Restore the scanner state from `length` bytes of `buffer`.
///
/// The buffer is expected to hold a whole number of `wchar_t` delimiter
/// characters, at most MAX_DELIMITER_LENGTH of them. A `length` of 0 restores
/// the "no delimiter recorded" state.
///
/// A buffer that is misaligned or larger than the delimiter storage trips an
/// assertion in debug builds. In builds where assertions are compiled out it
/// is rejected and the scanner falls back to the empty state, instead of
/// copying past the end of `delimiter`.
void tree_sitter_cpp_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
    Scanner *scanner = (Scanner *)payload;

    assert(length % sizeof(wchar_t) == 0 && "Can't decode serialized delimiter!");
    assert(length <= sizeof(scanner->delimiter) && "Serialized delimiter is too long!");

    if (length % sizeof(wchar_t) != 0 || length > sizeof(scanner->delimiter)) {
        // Never trust an out-of-range length: the memcpy below would
        // overflow the fixed-size delimiter array.
        scanner->delimiter_length = 0;
        return;
    }

    scanner->delimiter_length = (uint8_t)(length / sizeof(wchar_t));
    if (length > 0) {
        memcpy(&scanner->delimiter[0], buffer, length);
    }
}
|
|
|
|
|
|
|
|
void tree_sitter_cpp_external_scanner_destroy(void *payload) {
|
2025-07-10 19:10:17 +00:00
|
|
|
Scanner *scanner = (Scanner *)payload;
|
|
|
|
ts_free(scanner);
|
2025-07-08 03:26:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|