diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..452a28a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +test-grammars/*/*.scm linguist-vendored +test-grammars/*/src/** linguist-vendored +test-grammars/*/src/{parser.c,grammar.json,scanner.*} binary diff --git a/.gitignore b/.gitignore index 7d155d4..bf27905 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ target result .direnv -test-grammars/ +/test-grammars/*/*.so +/test-grammars/*/.BUILD_COOKIE diff --git a/highlighter/src/tests.rs b/highlighter/src/tests.rs index fb7c524..f95b772 100644 --- a/highlighter/src/tests.rs +++ b/highlighter/src/tests.rs @@ -15,7 +15,6 @@ use crate::injections_query::{InjectionLanguageMarker, InjectionsQuery}; use crate::Language; static GRAMMARS: Lazy> = Lazy::new(|| { - fs::create_dir_all("../test-grammars").unwrap(); let skidder_config = skidder_config(); skidder::fetch(&skidder_config, false).unwrap(); skidder::build_all_grammars(&skidder_config, false, None).unwrap(); @@ -26,12 +25,11 @@ static GRAMMARS: Lazy> = Lazy::new(|| { fn skidder_config() -> skidder::Config { skidder::Config { - repos: vec![Repo::Git { - name: "helix-language-support".to_owned(), - remote: "git@github.com:helix-editor/tree-sitter-grammars.git".into(), - branch: "reversed".into(), + repos: vec![Repo::Local { + // `./test-grammars` in the root of the repo. + path: Path::new("../test-grammars").canonicalize().unwrap(), }], - index: Path::new("../test-grammars").canonicalize().unwrap(), + index: PathBuf::new(), verbose: true, } } diff --git a/test-grammars/comment/LICENSE b/test-grammars/comment/LICENSE new file mode 100644 index 0000000..8b03b18 --- /dev/null +++ b/test-grammars/comment/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Santos Gallegos + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/test-grammars/comment/highlights.scm b/test-grammars/comment/highlights.scm new file mode 100644 index 0000000..56c1fd7 --- /dev/null +++ b/test-grammars/comment/highlights.scm @@ -0,0 +1,41 @@ +(tag + (name) @ui.text + (user)? @constant) + +; Hint level tags +((tag (name) @hint) + (#any-of? @hint "HINT" "MARK" "PASSED" "STUB" "MOCK")) + +("text" @hint + (#any-of? @hint "HINT" "MARK" "PASSED" "STUB" "MOCK")) + +; Info level tags +((tag (name) @info) + (#any-of? @info "INFO" "NOTE" "TODO" "PERF" "OPTIMIZE" "PERFORMANCE" "QUESTION" "ASK")) + +("text" @info + (#any-of? 
@info "INFO" "NOTE" "TODO" "PERF" "OPTIMIZE" "PERFORMANCE" "QUESTION" "ASK")) + +; Warning level tags +((tag (name) @warning) + (#any-of? @warning "HACK" "WARN" "WARNING" "TEST" "TEMP")) + +("text" @warning + (#any-of? @warning "HACK" "WARN" "WARNING" "TEST" "TEMP")) + +; Error level tags +((tag (name) @error) + (#any-of? @error "BUG" "FIXME" "ISSUE" "XXX" "FIX" "SAFETY" "FIXIT" "FAILED" "DEBUG" "INVARIANT" "COMPLIANCE")) + +("text" @error + (#any-of? @error "BUG" "FIXME" "ISSUE" "XXX" "FIX" "SAFETY" "FIXIT" "FAILED" "DEBUG" "INVARIANT" "COMPLIANCE")) + +; Issue number (#123) +("text" @constant.numeric + (#match? @constant.numeric "^#[0-9]+$")) + +; User mention (@user) +("text" @tag + (#match? @tag "^[@][a-zA-Z0-9_-]+$")) + +(uri) @markup.link.url diff --git a/test-grammars/comment/metadata.json b/test-grammars/comment/metadata.json new file mode 100644 index 0000000..3c3a6e1 --- /dev/null +++ b/test-grammars/comment/metadata.json @@ -0,0 +1,6 @@ +{ + "repo": "https://github.com/stsewd/tree-sitter-comment", + "rev": "aefcc2813392eb6ffe509aa0fc8b4e9b57413ee1", + "license": "MIT", + "compressed": true +} \ No newline at end of file diff --git a/test-grammars/comment/src/grammar.json b/test-grammars/comment/src/grammar.json new file mode 100644 index 0000000..de26a9e Binary files /dev/null and b/test-grammars/comment/src/grammar.json differ diff --git a/test-grammars/comment/src/parser.c b/test-grammars/comment/src/parser.c new file mode 100644 index 0000000..0713ce0 Binary files /dev/null and b/test-grammars/comment/src/parser.c differ diff --git a/test-grammars/comment/src/scanner.c b/test-grammars/comment/src/scanner.c new file mode 100644 index 0000000..d8b0e24 --- /dev/null +++ b/test-grammars/comment/src/scanner.c @@ -0,0 +1,35 @@ +#include + +#include "tree_sitter_comment/parser.c" +#include "tree_sitter_comment/tokens.h" + +void* tree_sitter_comment_external_scanner_create() +{ + return NULL; +} + +void tree_sitter_comment_external_scanner_destroy(void* payload) +{ +} + +unsigned tree_sitter_comment_external_scanner_serialize( + void* payload, + char* buffer) +{ + return 0; +} + +void tree_sitter_comment_external_scanner_deserialize( + void* payload, + const char* buffer, + unsigned length) +{ +} + +bool tree_sitter_comment_external_scanner_scan( + void* payload, + TSLexer* lexer, + const bool* valid_symbols) +{ + return parse(lexer, valid_symbols); +} diff --git a/test-grammars/comment/src/tree_sitter/parser.h b/test-grammars/comment/src/tree_sitter/parser.h new file mode 100644 index 0000000..2b14ac1 --- /dev/null +++ b/test-grammars/comment/src/tree_sitter/parser.h @@ -0,0 +1,224 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +typedef uint16_t TSStateId; + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool 
(*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; + const TSStateId *primary_state_ids; +}; + +/* + * Lexer Macros + */ + +#define START_LEXER() \ + bool result = false; \ + bool skip = false; \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) id - LARGE_STATE_COUNT + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = state_value \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = state_value, \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_val, child_count_val, ...) 
\ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_ diff --git a/test-grammars/comment/src/tree_sitter_comment/chars.c b/test-grammars/comment/src/tree_sitter_comment/chars.c new file mode 100644 index 0000000..85c0973 --- /dev/null +++ b/test-grammars/comment/src/tree_sitter_comment/chars.c @@ -0,0 +1,66 @@ +#include "chars.h" + +bool is_upper(int32_t c) +{ + const int32_t upper = 65; + const int32_t lower = 90; + return c >= upper && c <= lower; +} + +bool is_digit(int32_t c) +{ + const int32_t upper = 48; + const int32_t lower = 57; + return c >= upper && c <= lower; +} + +bool is_newline(int32_t c) +{ + const int32_t newline_chars[] = { + CHAR_EOF, + CHAR_NEWLINE, + CHAR_CARRIAGE_RETURN, + }; + const int length = sizeof(newline_chars) / sizeof(int32_t); + for (int i = 0; i < length; i++) { + if (c == newline_chars[i]) { + return true; + } + } + return false; +} + +bool is_space(int32_t c) +{ + const int32_t space_chars[] = { + CHAR_SPACE, + CHAR_FORM_FEED, + CHAR_TAB, + CHAR_VERTICAL_TAB, + }; + const int length = sizeof(space_chars) / sizeof(int32_t); + bool is_space_char = false; + for (int i = 0; i < length; i++) { + if (c == space_chars[i]) { + is_space_char = true; + break; + } + } + return is_space_char || is_newline(c); +} + +/// Check if the character is allowed inside the name. +bool is_internal_char(int32_t c) +{ + const int32_t valid_chars[] = { + '-', + '_', + }; + const int length = sizeof(valid_chars) / sizeof(int32_t); + for (int i = 0; i < length; i++) { + if (c == valid_chars[i]) { + return true; + } + } + return false; +} diff --git a/test-grammars/comment/src/tree_sitter_comment/chars.h b/test-grammars/comment/src/tree_sitter_comment/chars.h new file mode 100644 index 0000000..fa5ad3a --- /dev/null +++ b/test-grammars/comment/src/tree_sitter_comment/chars.h @@ -0,0 +1,22 @@ +#ifndef TREE_SITTER_COMMENT_CHARS_H +#define TREE_SITTER_COMMENT_CHARS_H + +#include +#include + +#define CHAR_EOF 0 +#define CHAR_NEWLINE 10 +#define CHAR_CARRIAGE_RETURN 13 + +#define CHAR_SPACE ' ' +#define CHAR_FORM_FEED '\f' +#define CHAR_TAB '\t' +#define CHAR_VERTICAL_TAB '\v' + +bool is_internal_char(int32_t c); +bool is_newline(int32_t c); +bool is_space(int32_t c); +bool is_upper(int32_t c); +bool is_digit(int32_t c); + +#endif /* ifndef TREE_SITTER_COMMENT_CHARS_H */ diff --git a/test-grammars/comment/src/tree_sitter_comment/parser.c b/test-grammars/comment/src/tree_sitter_comment/parser.c new file mode 100644 index 0000000..89eb616 --- /dev/null +++ b/test-grammars/comment/src/tree_sitter_comment/parser.c @@ -0,0 +1,97 @@ +#include "parser.h" + +#include "chars.c" +#include "tokens.h" +#include +#include + +/// Parse the name of the tag. 
+/// +/// They can be of the form: +/// - TODO: +/// - TODO: text +/// - TODO(stsewd): +/// - TODO(stsewd): text +/// - TODO (stsewd): text +bool parse_tagname(TSLexer* lexer, const bool* valid_symbols) +{ + if (!is_upper(lexer->lookahead) || !valid_symbols[T_TAGNAME]) { + return false; + } + + int32_t previous = lexer->lookahead; + lexer->advance(lexer, false); + + while (is_upper(lexer->lookahead) + || is_digit(lexer->lookahead) + || is_internal_char(lexer->lookahead)) { + previous = lexer->lookahead; + lexer->advance(lexer, false); + } + // The tag name ends here. + // But we keep parsing to see if it's a valid tag name. + lexer->mark_end(lexer); + + // It can't end with an internal char. + if (is_internal_char(previous)) { + return false; + } + + // For the user component this is `\s*(`. + // We don't parse that part, we just need to be sure it ends with `:\s`. + if ((is_space(lexer->lookahead) && !is_newline(lexer->lookahead)) + || lexer->lookahead == '(') { + // Skip white spaces. + while (is_space(lexer->lookahead) && !is_newline(lexer->lookahead)) { + lexer->advance(lexer, false); + } + // Checking aperture. + if (lexer->lookahead != '(') { + return false; + } + lexer->advance(lexer, false); + + // Checking closure. + int user_length = 0; + while (lexer->lookahead != ')') { + if (is_newline(lexer->lookahead)) { + return false; + } + lexer->advance(lexer, false); + user_length++; + } + if (user_length <= 0) { + return false; + } + lexer->advance(lexer, false); + } + + // It should end with `:`... + if (lexer->lookahead != ':') { + return false; + } + + // ... and be followed by one space. + lexer->advance(lexer, false); + if (!is_space(lexer->lookahead)) { + return false; + } + + lexer->result_symbol = T_TAGNAME; + return true; +} + +bool parse(TSLexer* lexer, const bool* valid_symbols) +{ + // If all valid symbols are true, tree-sitter is in correction mode. + // We don't want to parse anything in that case. 
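+    // (T_INVALID_TOKEN is not produced by any grammar rule, so it can only be marked valid while tree-sitter is error-correcting.)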
+ if (valid_symbols[T_INVALID_TOKEN]) { + return false; + } + + if (is_upper(lexer->lookahead) && valid_symbols[T_TAGNAME]) { + return parse_tagname(lexer, valid_symbols); + } + + return false; +} diff --git a/test-grammars/comment/src/tree_sitter_comment/parser.h b/test-grammars/comment/src/tree_sitter_comment/parser.h new file mode 100644 index 0000000..9c89dac --- /dev/null +++ b/test-grammars/comment/src/tree_sitter_comment/parser.h @@ -0,0 +1,9 @@ +#ifndef TREE_SITTER_COMMENT_PARSER_H +#define TREE_SITTER_COMMENT_PARSER_H + +#include + +bool parse_tagname(TSLexer* lexer, const bool* valid_symbols); +bool parse(TSLexer* lexer, const bool* valid_symbols); + +#endif /* ifndef TREE_SITTER_COMMENT_PARSER_H */ diff --git a/test-grammars/comment/src/tree_sitter_comment/tokens.h b/test-grammars/comment/src/tree_sitter_comment/tokens.h new file mode 100644 index 0000000..b165641 --- /dev/null +++ b/test-grammars/comment/src/tree_sitter_comment/tokens.h @@ -0,0 +1,9 @@ +#ifndef TREE_SITTER_COMMENT_TOKENS_H +#define TREE_SITTER_COMMENT_TOKENS_H + +enum TokenType { + T_TAGNAME, + T_INVALID_TOKEN, +}; + +#endif /* ifndef TREE_SITTER_COMMENT_TOKENS_H */ diff --git a/test-grammars/html/LICENSE b/test-grammars/html/LICENSE new file mode 100644 index 0000000..4b52d19 --- /dev/null +++ b/test-grammars/html/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Max Brunsfeld + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/test-grammars/html/highlights.scm b/test-grammars/html/highlights.scm new file mode 100644 index 0000000..8581f0a --- /dev/null +++ b/test-grammars/html/highlights.scm @@ -0,0 +1,47 @@ +(tag_name) @tag +(erroneous_end_tag_name) @error +(doctype) @constant +(attribute_name) @attribute + +(attribute [(attribute_value) (quoted_attribute_value)] @string) + +((attribute + (attribute_name) @attribute + (quoted_attribute_value (attribute_value) @markup.link.url)) + (#any-of? @attribute "href" "src")) + +((element + (start_tag + (tag_name) @_tag) + (text) @markup.link.label) + (#eq? @_tag "a")) + +((element + (start_tag + (tag_name) @_tag) + (text) @markup.bold) + (#any-of? @_tag "strong" "b")) + +((element + (start_tag + (tag_name) @_tag) + (text) @markup.italic) + (#any-of? @_tag "em" "i")) + +((element + (start_tag + (tag_name) @_tag) + (text) @markup.strikethrough) + (#any-of? 
@_tag "s" "del")) + +[ + "<" + ">" + "" + " +#include +#include +#include +#include +#include +#include "tag.h" + +namespace { + +using std::vector; +using std::string; + +enum TokenType { + START_TAG_NAME, + SCRIPT_START_TAG_NAME, + STYLE_START_TAG_NAME, + END_TAG_NAME, + ERRONEOUS_END_TAG_NAME, + SELF_CLOSING_TAG_DELIMITER, + IMPLICIT_END_TAG, + RAW_TEXT, + COMMENT +}; + +struct Scanner { + Scanner() {} + + unsigned serialize(char *buffer) { + uint16_t tag_count = tags.size() > UINT16_MAX ? UINT16_MAX : tags.size(); + uint16_t serialized_tag_count = 0; + + unsigned i = sizeof(tag_count); + std::memcpy(&buffer[i], &tag_count, sizeof(tag_count)); + i += sizeof(tag_count); + + for (; serialized_tag_count < tag_count; serialized_tag_count++) { + Tag &tag = tags[serialized_tag_count]; + if (tag.type == CUSTOM) { + unsigned name_length = tag.custom_tag_name.size(); + if (name_length > UINT8_MAX) name_length = UINT8_MAX; + if (i + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break; + buffer[i++] = static_cast(tag.type); + buffer[i++] = name_length; + tag.custom_tag_name.copy(&buffer[i], name_length); + i += name_length; + } else { + if (i + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) break; + buffer[i++] = static_cast(tag.type); + } + } + + std::memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count)); + return i; + } + + void deserialize(const char *buffer, unsigned length) { + tags.clear(); + if (length > 0) { + unsigned i = 0; + uint16_t tag_count, serialized_tag_count; + + std::memcpy(&serialized_tag_count, &buffer[i], sizeof(serialized_tag_count)); + i += sizeof(serialized_tag_count); + + std::memcpy(&tag_count, &buffer[i], sizeof(tag_count)); + i += sizeof(tag_count); + + tags.resize(tag_count); + for (unsigned j = 0; j < serialized_tag_count; j++) { + Tag &tag = tags[j]; + tag.type = static_cast(buffer[i++]); + if (tag.type == CUSTOM) { + uint16_t name_length = static_cast(buffer[i++]); + tag.custom_tag_name.assign(&buffer[i], &buffer[i + name_length]); + i += name_length; + } + } + } + } + + string scan_tag_name(TSLexer *lexer) { + string tag_name; + while (iswalnum(lexer->lookahead) || + lexer->lookahead == '-' || + lexer->lookahead == ':') { + tag_name += towupper(lexer->lookahead); + lexer->advance(lexer, false); + } + return tag_name; + } + + bool scan_comment(TSLexer *lexer) { + if (lexer->lookahead != '-') return false; + lexer->advance(lexer, false); + if (lexer->lookahead != '-') return false; + lexer->advance(lexer, false); + + unsigned dashes = 0; + while (lexer->lookahead) { + switch (lexer->lookahead) { + case '-': + ++dashes; + break; + case '>': + if (dashes >= 2) { + lexer->result_symbol = COMMENT; + lexer->advance(lexer, false); + lexer->mark_end(lexer); + return true; + } + default: + dashes = 0; + } + lexer->advance(lexer, false); + } + return false; + } + + bool scan_raw_text(TSLexer *lexer) { + if (!tags.size()) return false; + + lexer->mark_end(lexer); + + const string &end_delimiter = tags.back().type == SCRIPT + ? "lookahead) { + if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) { + delimiter_index++; + if (delimiter_index == end_delimiter.size()) break; + lexer->advance(lexer, false); + } else { + delimiter_index = 0; + lexer->advance(lexer, false); + lexer->mark_end(lexer); + } + } + + lexer->result_symbol = RAW_TEXT; + return true; + } + + bool scan_implicit_end_tag(TSLexer *lexer) { + Tag *parent = tags.empty() ? 
NULL : &tags.back(); + + bool is_closing_tag = false; + if (lexer->lookahead == '/') { + is_closing_tag = true; + lexer->advance(lexer, false); + } else { + if (parent && parent->is_void()) { + tags.pop_back(); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + } + + string tag_name = scan_tag_name(lexer); + if (tag_name.empty()) return false; + + Tag next_tag = Tag::for_name(tag_name); + + if (is_closing_tag) { + // The tag correctly closes the topmost element on the stack + if (!tags.empty() && tags.back() == next_tag) return false; + + // Otherwise, dig deeper and queue implicit end tags (to be nice in + // the case of malformed HTML) + if (std::find(tags.begin(), tags.end(), next_tag) != tags.end()) { + tags.pop_back(); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + } else if (parent && !parent->can_contain(next_tag)) { + tags.pop_back(); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + + return false; + } + + bool scan_start_tag_name(TSLexer *lexer) { + string tag_name = scan_tag_name(lexer); + if (tag_name.empty()) return false; + Tag tag = Tag::for_name(tag_name); + tags.push_back(tag); + switch (tag.type) { + case SCRIPT: + lexer->result_symbol = SCRIPT_START_TAG_NAME; + break; + case STYLE: + lexer->result_symbol = STYLE_START_TAG_NAME; + break; + default: + lexer->result_symbol = START_TAG_NAME; + break; + } + return true; + } + + bool scan_end_tag_name(TSLexer *lexer) { + string tag_name = scan_tag_name(lexer); + if (tag_name.empty()) return false; + Tag tag = Tag::for_name(tag_name); + if (!tags.empty() && tags.back() == tag) { + tags.pop_back(); + lexer->result_symbol = END_TAG_NAME; + } else { + lexer->result_symbol = ERRONEOUS_END_TAG_NAME; + } + return true; + } + + bool scan_self_closing_tag_delimiter(TSLexer *lexer) { + lexer->advance(lexer, false); + if (lexer->lookahead == '>') { + lexer->advance(lexer, false); + if (!tags.empty()) { + tags.pop_back(); + lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER; + } + return true; + } + return false; + } + + bool scan(TSLexer *lexer, const bool *valid_symbols) { + while (iswspace(lexer->lookahead)) { + lexer->advance(lexer, true); + } + + if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) { + return scan_raw_text(lexer); + } + + switch (lexer->lookahead) { + case '<': + lexer->mark_end(lexer); + lexer->advance(lexer, false); + + if (lexer->lookahead == '!') { + lexer->advance(lexer, false); + return scan_comment(lexer); + } + + if (valid_symbols[IMPLICIT_END_TAG]) { + return scan_implicit_end_tag(lexer); + } + break; + + case '\0': + if (valid_symbols[IMPLICIT_END_TAG]) { + return scan_implicit_end_tag(lexer); + } + break; + + case '/': + if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) { + return scan_self_closing_tag_delimiter(lexer); + } + break; + + default: + if ((valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) && !valid_symbols[RAW_TEXT]) { + return valid_symbols[START_TAG_NAME] + ? 
scan_start_tag_name(lexer) + : scan_end_tag_name(lexer); + } + } + + return false; + } + + vector tags; +}; + +} + +extern "C" { + +void *tree_sitter_html_external_scanner_create() { + return new Scanner(); +} + +bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = static_cast(payload); + return scanner->scan(lexer, valid_symbols); +} + +unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) { + Scanner *scanner = static_cast(payload); + return scanner->serialize(buffer); +} + +void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *scanner = static_cast(payload); + scanner->deserialize(buffer, length); +} + +void tree_sitter_html_external_scanner_destroy(void *payload) { + Scanner *scanner = static_cast(payload); + delete scanner; +} + +} diff --git a/test-grammars/html/src/tag.h b/test-grammars/html/src/tag.h new file mode 100644 index 0000000..b068eca --- /dev/null +++ b/test-grammars/html/src/tag.h @@ -0,0 +1,380 @@ +#include +#include + +using std::string; +using std::map; + +enum TagType { + AREA, + BASE, + BASEFONT, + BGSOUND, + BR, + COL, + COMMAND, + EMBED, + FRAME, + HR, + IMAGE, + IMG, + INPUT, + ISINDEX, + KEYGEN, + LINK, + MENUITEM, + META, + NEXTID, + PARAM, + SOURCE, + TRACK, + WBR, + END_OF_VOID_TAGS, + + A, + ABBR, + ADDRESS, + ARTICLE, + ASIDE, + AUDIO, + B, + BDI, + BDO, + BLOCKQUOTE, + BODY, + BUTTON, + CANVAS, + CAPTION, + CITE, + CODE, + COLGROUP, + DATA, + DATALIST, + DD, + DEL, + DETAILS, + DFN, + DIALOG, + DIV, + DL, + DT, + EM, + FIELDSET, + FIGCAPTION, + FIGURE, + FOOTER, + FORM, + H1, + H2, + H3, + H4, + H5, + H6, + HEAD, + HEADER, + HGROUP, + HTML, + I, + IFRAME, + INS, + KBD, + LABEL, + LEGEND, + LI, + MAIN, + MAP, + MARK, + MATH, + MENU, + METER, + NAV, + NOSCRIPT, + OBJECT, + OL, + OPTGROUP, + OPTION, + OUTPUT, + P, + PICTURE, + PRE, + PROGRESS, + Q, + RB, + RP, + RT, + RTC, + RUBY, + S, + SAMP, + SCRIPT, + SECTION, + SELECT, + SLOT, + SMALL, + SPAN, + STRONG, + STYLE, + SUB, + SUMMARY, + SUP, + SVG, + TABLE, + TBODY, + TD, + TEMPLATE, + TEXTAREA, + TFOOT, + TH, + THEAD, + TIME, + TITLE, + TR, + U, + UL, + VAR, + VIDEO, + + CUSTOM, +}; + + +static const map get_tag_map() { + map result; +#define TAG(name) result[#name] = name + TAG(AREA); + TAG(BASE); + TAG(BASEFONT); + TAG(BGSOUND); + TAG(BR); + TAG(COL); + TAG(COMMAND); + TAG(EMBED); + TAG(FRAME); + TAG(HR); + TAG(IMAGE); + TAG(IMG); + TAG(INPUT); + TAG(ISINDEX); + TAG(KEYGEN); + TAG(LINK); + TAG(MENUITEM); + TAG(META); + TAG(NEXTID); + TAG(PARAM); + TAG(SOURCE); + TAG(TRACK); + TAG(WBR); + TAG(A); + TAG(ABBR); + TAG(ADDRESS); + TAG(ARTICLE); + TAG(ASIDE); + TAG(AUDIO); + TAG(B); + TAG(BDI); + TAG(BDO); + TAG(BLOCKQUOTE); + TAG(BODY); + TAG(BUTTON); + TAG(CANVAS); + TAG(CAPTION); + TAG(CITE); + TAG(CODE); + TAG(COLGROUP); + TAG(DATA); + TAG(DATALIST); + TAG(DD); + TAG(DEL); + TAG(DETAILS); + TAG(DFN); + TAG(DIALOG); + TAG(DIV); + TAG(DL); + TAG(DT); + TAG(EM); + TAG(FIELDSET); + TAG(FIGCAPTION); + TAG(FIGURE); + TAG(FOOTER); + TAG(FORM); + TAG(H1); + TAG(H2); + TAG(H3); + TAG(H4); + TAG(H5); + TAG(H6); + TAG(HEAD); + TAG(HEADER); + TAG(HGROUP); + TAG(HTML); + TAG(I); + TAG(IFRAME); + TAG(INS); + TAG(KBD); + TAG(LABEL); + TAG(LEGEND); + TAG(LI); + TAG(MAIN); + TAG(MAP); + TAG(MARK); + TAG(MATH); + TAG(MENU); + TAG(METER); + TAG(NAV); + TAG(NOSCRIPT); + TAG(OBJECT); + TAG(OL); + TAG(OPTGROUP); + TAG(OPTION); + TAG(OUTPUT); + TAG(P); + TAG(PICTURE); + 
TAG(PRE); + TAG(PROGRESS); + TAG(Q); + TAG(RB); + TAG(RP); + TAG(RT); + TAG(RTC); + TAG(RUBY); + TAG(S); + TAG(SAMP); + TAG(SCRIPT); + TAG(SECTION); + TAG(SELECT); + TAG(SLOT); + TAG(SMALL); + TAG(SPAN); + TAG(STRONG); + TAG(STYLE); + TAG(SUB); + TAG(SUMMARY); + TAG(SUP); + TAG(SVG); + TAG(TABLE); + TAG(TBODY); + TAG(TD); + TAG(TEMPLATE); + TAG(TEXTAREA); + TAG(TFOOT); + TAG(TH); + TAG(THEAD); + TAG(TIME); + TAG(TITLE); + TAG(TR); + TAG(U); + TAG(UL); + TAG(VAR); + TAG(VIDEO); +#undef TAG + return result; +} + +static const map TAG_TYPES_BY_TAG_NAME = get_tag_map(); + +static const TagType TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[] = { + ADDRESS, + ARTICLE, + ASIDE, + BLOCKQUOTE, + DETAILS, + DIV, + DL, + FIELDSET, + FIGCAPTION, + FIGURE, + FOOTER, + FORM, + H1, + H2, + H3, + H4, + H5, + H6, + HEADER, + HR, + MAIN, + NAV, + OL, + P, + PRE, + SECTION, +}; + +static const TagType *TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS_END = ( + TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS + + sizeof(TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS) / + sizeof(TagType) +); + +struct Tag { + TagType type; + string custom_tag_name; + + // This default constructor is used in the case where there is not enough space + // in the serialization buffer to store all of the tags. In that case, tags + // that cannot be serialized will be treated as having an unknown type. These + // tags will be closed via implicit end tags regardless of the next closing + // tag is encountered. + Tag() : type(END_OF_VOID_TAGS) {} + + Tag(TagType type, const string &name) : type(type), custom_tag_name(name) {} + + bool operator==(const Tag &other) const { + if (type != other.type) return false; + if (type == CUSTOM && custom_tag_name != other.custom_tag_name) return false; + return true; + } + + inline bool is_void() const { + return type < END_OF_VOID_TAGS; + } + + inline bool can_contain(const Tag &tag) { + TagType child = tag.type; + + switch (type) { + case LI: return child != LI; + + case DT: + case DD: + return child != DT && child != DD; + + case P: + return std::find( + TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS, + TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS_END, + tag.type + ) == TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS_END; + + case COLGROUP: + return child == COL; + + case RB: + case RT: + case RP: + return child != RB && child != RT && child != RP; + + case OPTGROUP: + return child != OPTGROUP; + + case TR: + return child != TR; + + case TD: + case TH: + return child != TD && child != TH && child != TR; + + default: + return true; + } + } + + static inline Tag for_name(const string &name) { + map::const_iterator type = TAG_TYPES_BY_TAG_NAME.find(name); + if (type != TAG_TYPES_BY_TAG_NAME.end()) { + return Tag(type->second, string()); + } else { + return Tag(CUSTOM, name); + } + } +}; diff --git a/test-grammars/html/src/tree_sitter/parser.h b/test-grammars/html/src/tree_sitter/parser.h new file mode 100644 index 0000000..cbbc7b4 --- /dev/null +++ b/test-grammars/html/src/tree_sitter/parser.h @@ -0,0 +1,223 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +typedef uint16_t TSStateId; + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + 
uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; +}; + +/* + * Lexer Macros + */ + +#define START_LEXER() \ + bool result = false; \ + bool skip = false; \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) id - LARGE_STATE_COUNT + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = state_value \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = state_value, \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_val, child_count_val, ...) 
\ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_ diff --git a/test-grammars/markdown-inline/LICENSE b/test-grammars/markdown-inline/LICENSE new file mode 100644 index 0000000..c125939 --- /dev/null +++ b/test-grammars/markdown-inline/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Matthias Deiml + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/test-grammars/markdown-inline/highlights.scm b/test-grammars/markdown-inline/highlights.scm new file mode 100644 index 0000000..ee0926a --- /dev/null +++ b/test-grammars/markdown-inline/highlights.scm @@ -0,0 +1,39 @@ +;; From nvim-treesitter/nvim-treesitter +[ + (code_span) + (link_title) +] @markup.raw.inline + +[ + (emphasis_delimiter) + (code_span_delimiter) +] @punctuation.bracket + +(emphasis) @markup.italic + +(strong_emphasis) @markup.bold + +(strikethrough) @markup.strikethrough + +[ + (link_destination) + (uri_autolink) +] @markup.link.url + +[ + (link_text) + (image_description) +] @markup.link.text + +(link_label) @markup.link.label + +[ + (backslash_escape) + (hard_line_break) +] @constant.character.escape + +(image ["[" "]" "(" ")"] @punctuation.bracket) +(image "!" @punctuation.special) +(inline_link ["[" "]" "(" ")"] @punctuation.bracket) +(shortcut_link ["[" "]"] @punctuation.bracket) + diff --git a/test-grammars/markdown-inline/injections.scm b/test-grammars/markdown-inline/injections.scm new file mode 100644 index 0000000..62b8267 --- /dev/null +++ b/test-grammars/markdown-inline/injections.scm @@ -0,0 +1,7 @@ + +((html_tag) @injection.content + (#set! injection.language "html") + (#set! injection.include-unnamed-children) + (#set! injection.combined)) + +((latex_block) @injection.content (#set! injection.language "latex") (#set! 
injection.include-unnamed-children)) diff --git a/test-grammars/markdown-inline/metadata.json b/test-grammars/markdown-inline/metadata.json new file mode 100644 index 0000000..44667f5 --- /dev/null +++ b/test-grammars/markdown-inline/metadata.json @@ -0,0 +1,6 @@ +{ + "repo": "https://github.com/tree-sitter-grammars/tree-sitter-markdown", + "rev": "62516e8c78380e3b51d5b55727995d2c511436d8", + "license": "MIT", + "compressed": true +} \ No newline at end of file diff --git a/test-grammars/markdown-inline/src/grammar.json b/test-grammars/markdown-inline/src/grammar.json new file mode 100644 index 0000000..8ba0b86 Binary files /dev/null and b/test-grammars/markdown-inline/src/grammar.json differ diff --git a/test-grammars/markdown-inline/src/parser.c b/test-grammars/markdown-inline/src/parser.c new file mode 100644 index 0000000..9cf0fcc Binary files /dev/null and b/test-grammars/markdown-inline/src/parser.c differ diff --git a/test-grammars/markdown-inline/src/scanner.c b/test-grammars/markdown-inline/src/scanner.c new file mode 100644 index 0000000..b5e48b4 --- /dev/null +++ b/test-grammars/markdown-inline/src/scanner.c @@ -0,0 +1,397 @@ +#include "tree_sitter/parser.h" + +#ifdef _MSC_VER +#define UNUSED __pragma(warning(suppress : 4101)) +#else +#define UNUSED __attribute__((unused)) +#endif + +// For explanation of the tokens see grammar.js +typedef enum { + ERROR, + TRIGGER_ERROR, + CODE_SPAN_START, + CODE_SPAN_CLOSE, + EMPHASIS_OPEN_STAR, + EMPHASIS_OPEN_UNDERSCORE, + EMPHASIS_CLOSE_STAR, + EMPHASIS_CLOSE_UNDERSCORE, + LAST_TOKEN_WHITESPACE, + LAST_TOKEN_PUNCTUATION, + STRIKETHROUGH_OPEN, + STRIKETHROUGH_CLOSE, + LATEX_SPAN_START, + LATEX_SPAN_CLOSE, + UNCLOSED_SPAN +} TokenType; + +// Determines if a character is punctuation as defined by the markdown spec. +static bool is_punctuation(char chr) { + return (chr >= '!' && chr <= '/') || (chr >= ':' && chr <= '@') || + (chr >= '[' && chr <= '`') || (chr >= '{' && chr <= '~'); +} + +// State bitflags used with `Scanner.state` + +// TODO +static UNUSED const uint8_t STATE_EMPHASIS_DELIMITER_MOD_3 = 0x3; +// Current delimiter run is opening +static const uint8_t STATE_EMPHASIS_DELIMITER_IS_OPEN = 0x1 << 2; + +// Convenience function to emit the error token. This is done to stop invalid +// parse branches. Specifically: +// 1. When encountering a newline after a line break that ended a paragraph, and +// no new block +// has been opened. +// 2. When encountering a new block after a soft line break. +// 3. When a `$._trigger_error` token is valid, which is used to stop parse +// branches through +// normal tree-sitter grammar rules. +// +// See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in +// grammar.js +static bool error(TSLexer *lexer) { + lexer->result_symbol = ERROR; + return true; +} + +typedef struct { + // Parser state flags + uint8_t state; + uint8_t code_span_delimiter_length; + uint8_t latex_span_delimiter_length; + // The number of characters remaining in the currrent emphasis delimiter + // run. 
+ uint8_t num_emphasis_delimiters_left; + +} Scanner; + +// Write the whole state of a Scanner to a byte buffer +static unsigned serialize(Scanner *s, char *buffer) { + unsigned size = 0; + buffer[size++] = (char)s->state; + buffer[size++] = (char)s->code_span_delimiter_length; + buffer[size++] = (char)s->latex_span_delimiter_length; + buffer[size++] = (char)s->num_emphasis_delimiters_left; + return size; +} + +// Read the whole state of a Scanner from a byte buffer +// `serizalize` and `deserialize` should be fully symmetric. +static void deserialize(Scanner *s, const char *buffer, unsigned length) { + s->state = 0; + s->code_span_delimiter_length = 0; + s->latex_span_delimiter_length = 0; + s->num_emphasis_delimiters_left = 0; + if (length > 0) { + size_t size = 0; + s->state = (uint8_t)buffer[size++]; + s->code_span_delimiter_length = (uint8_t)buffer[size++]; + s->latex_span_delimiter_length = (uint8_t)buffer[size++]; + s->num_emphasis_delimiters_left = (uint8_t)buffer[size++]; + } +} + +static bool parse_leaf_delimiter(TSLexer *lexer, uint8_t *delimiter_length, + const bool *valid_symbols, + const char delimiter, + const TokenType open_token, + const TokenType close_token) { + uint8_t level = 0; + while (lexer->lookahead == delimiter) { + lexer->advance(lexer, false); + level++; + } + lexer->mark_end(lexer); + if (level == *delimiter_length && valid_symbols[close_token]) { + *delimiter_length = 0; + lexer->result_symbol = close_token; + return true; + } + if (valid_symbols[open_token]) { + // Parse ahead to check if there is a closing delimiter + size_t close_level = 0; + while (!lexer->eof(lexer)) { + if (lexer->lookahead == delimiter) { + close_level++; + } else { + if (close_level == level) { + // Found a matching delimiter + break; + } + close_level = 0; + } + lexer->advance(lexer, false); + } + if (close_level == level) { + *delimiter_length = level; + lexer->result_symbol = open_token; + return true; + } + if (valid_symbols[UNCLOSED_SPAN]) { + lexer->result_symbol = UNCLOSED_SPAN; + return true; + } + } + return false; +} + +static bool parse_backtick(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + return parse_leaf_delimiter(lexer, &s->code_span_delimiter_length, + valid_symbols, '`', CODE_SPAN_START, + CODE_SPAN_CLOSE); +} + +static bool parse_dollar(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + return parse_leaf_delimiter(lexer, &s->latex_span_delimiter_length, + valid_symbols, '$', LATEX_SPAN_START, + LATEX_SPAN_CLOSE); +} + +static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + lexer->advance(lexer, false); + // If `num_emphasis_delimiters_left` is not zero then we already decided + // that this should be part of an emphasis delimiter run, so interpret it as + // such. + if (s->num_emphasis_delimiters_left > 0) { + // The `STATE_EMPHASIS_DELIMITER_IS_OPEN` state flag tells us wether it + // should be open or close. 
+ if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && + valid_symbols[EMPHASIS_OPEN_STAR]) { + s->state &= (~STATE_EMPHASIS_DELIMITER_IS_OPEN); + lexer->result_symbol = EMPHASIS_OPEN_STAR; + s->num_emphasis_delimiters_left--; + return true; + } + if (valid_symbols[EMPHASIS_CLOSE_STAR]) { + lexer->result_symbol = EMPHASIS_CLOSE_STAR; + s->num_emphasis_delimiters_left--; + return true; + } + } + lexer->mark_end(lexer); + // Otherwise count the number of stars + uint8_t star_count = 1; + while (lexer->lookahead == '*') { + star_count++; + lexer->advance(lexer, false); + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r' || + lexer->eof(lexer); + if (valid_symbols[EMPHASIS_OPEN_STAR] || + valid_symbols[EMPHASIS_CLOSE_STAR]) { + // The desicion made for the first star also counts for all the + // following stars in the delimiter run. Rembemer how many there are. + s->num_emphasis_delimiters_left = star_count - 1; + // Look ahead to the next symbol (after the last star) to find out if it + // is whitespace punctuation or other. + bool next_symbol_whitespace = + line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t'; + bool next_symbol_punctuation = is_punctuation((char)lexer->lookahead); + // Information about the last token is in valid_symbols. See grammar.js + // for these tokens for how this is done. + if (valid_symbols[EMPHASIS_CLOSE_STAR] && + !valid_symbols[LAST_TOKEN_WHITESPACE] && + (!valid_symbols[LAST_TOKEN_PUNCTUATION] || + next_symbol_punctuation || next_symbol_whitespace)) { + // Closing delimiters take precedence + s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = EMPHASIS_CLOSE_STAR; + return true; + } + if (!next_symbol_whitespace && (!next_symbol_punctuation || + valid_symbols[LAST_TOKEN_PUNCTUATION] || + valid_symbols[LAST_TOKEN_WHITESPACE])) { + s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = EMPHASIS_OPEN_STAR; + return true; + } + } + return false; +} + +static bool parse_tilde(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + lexer->advance(lexer, false); + // If `num_emphasis_delimiters_left` is not zero then we already decided + // that this should be part of an emphasis delimiter run, so interpret it as + // such. + if (s->num_emphasis_delimiters_left > 0) { + // The `STATE_EMPHASIS_DELIMITER_IS_OPEN` state flag tells us wether it + // should be open or close. + if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && + valid_symbols[STRIKETHROUGH_OPEN]) { + s->state &= (~STATE_EMPHASIS_DELIMITER_IS_OPEN); + lexer->result_symbol = STRIKETHROUGH_OPEN; + s->num_emphasis_delimiters_left--; + return true; + } + if (valid_symbols[STRIKETHROUGH_CLOSE]) { + lexer->result_symbol = STRIKETHROUGH_CLOSE; + s->num_emphasis_delimiters_left--; + return true; + } + } + lexer->mark_end(lexer); + // Otherwise count the number of tildes + uint8_t star_count = 1; + while (lexer->lookahead == '~') { + star_count++; + lexer->advance(lexer, false); + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r' || + lexer->eof(lexer); + if (valid_symbols[STRIKETHROUGH_OPEN] || + valid_symbols[STRIKETHROUGH_CLOSE]) { + // The desicion made for the first star also counts for all the + // following stars in the delimiter run. Rembemer how many there are. + s->num_emphasis_delimiters_left = star_count - 1; + // Look ahead to the next symbol (after the last star) to find out if it + // is whitespace punctuation or other. 
+ bool next_symbol_whitespace = + line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t'; + bool next_symbol_punctuation = is_punctuation((char)lexer->lookahead); + // Information about the last token is in valid_symbols. See grammar.js + // for these tokens for how this is done. + if (valid_symbols[STRIKETHROUGH_CLOSE] && + !valid_symbols[LAST_TOKEN_WHITESPACE] && + (!valid_symbols[LAST_TOKEN_PUNCTUATION] || + next_symbol_punctuation || next_symbol_whitespace)) { + // Closing delimiters take precedence + s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = STRIKETHROUGH_CLOSE; + return true; + } + if (!next_symbol_whitespace && (!next_symbol_punctuation || + valid_symbols[LAST_TOKEN_PUNCTUATION] || + valid_symbols[LAST_TOKEN_WHITESPACE])) { + s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = STRIKETHROUGH_OPEN; + return true; + } + } + return false; +} + +static bool parse_underscore(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + lexer->advance(lexer, false); + // If `num_emphasis_delimiters_left` is not zero then we already decided + // that this should be part of an emphasis delimiter run, so interpret it as + // such. + if (s->num_emphasis_delimiters_left > 0) { + // The `STATE_EMPHASIS_DELIMITER_IS_OPEN` state flag tells us wether it + // should be open or close. + if ((s->state & STATE_EMPHASIS_DELIMITER_IS_OPEN) && + valid_symbols[EMPHASIS_OPEN_UNDERSCORE]) { + s->state &= (~STATE_EMPHASIS_DELIMITER_IS_OPEN); + lexer->result_symbol = EMPHASIS_OPEN_UNDERSCORE; + s->num_emphasis_delimiters_left--; + return true; + } + if (valid_symbols[EMPHASIS_CLOSE_UNDERSCORE]) { + lexer->result_symbol = EMPHASIS_CLOSE_UNDERSCORE; + s->num_emphasis_delimiters_left--; + return true; + } + } + lexer->mark_end(lexer); + // Otherwise count the number of stars + uint8_t underscore_count = 1; + while (lexer->lookahead == '_') { + underscore_count++; + lexer->advance(lexer, false); + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r' || + lexer->eof(lexer); + if (valid_symbols[EMPHASIS_OPEN_UNDERSCORE] || + valid_symbols[EMPHASIS_CLOSE_UNDERSCORE]) { + // The desicion made for the first underscore also counts for all the + // following underscores in the delimiter run. Rembemer how many there are. + s->num_emphasis_delimiters_left = underscore_count - 1; + // Look ahead to the next symbol (after the last underscore) to find out if it + // is whitespace punctuation or other. + bool next_symbol_whitespace = + line_end || lexer->lookahead == ' ' || lexer->lookahead == '\t'; + bool next_symbol_punctuation = is_punctuation((char)lexer->lookahead); + // Information about the last token is in valid_symbols. See grammar.js + // for these tokens for how this is done. 
+ if (valid_symbols[EMPHASIS_CLOSE_UNDERSCORE] && + !valid_symbols[LAST_TOKEN_WHITESPACE] && + (!valid_symbols[LAST_TOKEN_PUNCTUATION] || + next_symbol_punctuation || next_symbol_whitespace)) { + // Closing delimiters take precedence + s->state &= ~STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = EMPHASIS_CLOSE_UNDERSCORE; + return true; + } + if (!next_symbol_whitespace && (!next_symbol_punctuation || + valid_symbols[LAST_TOKEN_PUNCTUATION] || + valid_symbols[LAST_TOKEN_WHITESPACE])) { + s->state |= STATE_EMPHASIS_DELIMITER_IS_OPEN; + lexer->result_symbol = EMPHASIS_OPEN_UNDERSCORE; + return true; + } + } + return false; +} + +static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + // A normal tree-sitter rule decided that the current branch is invalid and + // now "requests" an error to stop the branch + if (valid_symbols[TRIGGER_ERROR]) { + return error(lexer); + } + + // Decide which tokens to consider based on the first non-whitespace + // character + switch (lexer->lookahead) { + case '`': + // A backtick could mark the beginning or ending of a code span or a + // fenced code block. + return parse_backtick(s, lexer, valid_symbols); + case '$': + return parse_dollar(s, lexer, valid_symbols); + case '*': + // A star could either mark the beginning or ending of emphasis, a + // list item or thematic break. This code is similar to the code for + // '_' and '+'. + return parse_star(s, lexer, valid_symbols); + case '_': + return parse_underscore(s, lexer, valid_symbols); + case '~': + return parse_tilde(s, lexer, valid_symbols); + } + return false; +} + +void *tree_sitter_markdown_inline_external_scanner_create() { + Scanner *s = (Scanner *)malloc(sizeof(Scanner)); + deserialize(s, NULL, 0); + return s; +} + +bool tree_sitter_markdown_inline_external_scanner_scan( + void *payload, TSLexer *lexer, const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_markdown_inline_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_markdown_inline_external_scanner_deserialize(void *payload, + char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +void tree_sitter_markdown_inline_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + free(scanner); +} diff --git a/test-grammars/markdown-inline/src/tree_sitter/alloc.h b/test-grammars/markdown-inline/src/tree_sitter/alloc.h new file mode 100644 index 0000000..1f4466d --- /dev/null +++ b/test-grammars/markdown-inline/src/tree_sitter/alloc.h @@ -0,0 +1,54 @@ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +// Allow clients to override allocation functions +#ifdef TREE_SITTER_REUSE_ALLOCATOR + +extern void *(*ts_current_malloc)(size_t); +extern void *(*ts_current_calloc)(size_t, size_t); +extern void *(*ts_current_realloc)(void *, size_t); +extern void (*ts_current_free)(void *); + +#ifndef ts_malloc +#define ts_malloc ts_current_malloc +#endif +#ifndef ts_calloc +#define ts_calloc ts_current_calloc +#endif +#ifndef ts_realloc +#define ts_realloc ts_current_realloc +#endif +#ifndef ts_free +#define ts_free ts_current_free +#endif + +#else + +#ifndef ts_malloc +#define ts_malloc malloc +#endif +#ifndef ts_calloc +#define ts_calloc calloc +#endif +#ifndef 
ts_realloc +#define ts_realloc realloc +#endif +#ifndef ts_free +#define ts_free free +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ALLOC_H_ diff --git a/test-grammars/markdown-inline/src/tree_sitter/array.h b/test-grammars/markdown-inline/src/tree_sitter/array.h new file mode 100644 index 0000000..186ba67 --- /dev/null +++ b/test-grammars/markdown-inline/src/tree_sitter/array.h @@ -0,0 +1,287 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./alloc.h" + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(disable : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#define Array(T) \ + struct { \ + T *contents; \ + uint32_t size; \ + uint32_t capacity; \ + } + +/// Initialize an array. +#define array_init(self) \ + ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +/// Create an empty array. +#define array_new() \ + { NULL, 0, 0 } + +/// Get a pointer to the element at a given `index` in the array. +#define array_get(self, _index) \ + (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + +/// Get a pointer to the first element in the array. +#define array_front(self) array_get(self, 0) + +/// Get a pointer to the last element in the array. +#define array_back(self) array_get(self, (self)->size - 1) + +/// Clear the array, setting its size to zero. Note that this does not free any +/// memory allocated for the array's contents. +#define array_clear(self) ((self)->size = 0) + +/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is +/// less than the array's current capacity, this function has no effect. +#define array_reserve(self, new_capacity) \ + _array__reserve((Array *)(self), array_elem_size(self), new_capacity) + +/// Free any memory allocated for this array. Note that this does not free any +/// memory allocated for the array's contents. +#define array_delete(self) _array__delete((Array *)(self)) + +/// Push a new `element` onto the end of the array. +#define array_push(self, element) \ + (_array__grow((Array *)(self), 1, array_elem_size(self)), \ + (self)->contents[(self)->size++] = (element)) + +/// Increase the array's size by `count` elements. +/// New elements are zero-initialized. +#define array_grow_by(self, count) \ + (_array__grow((Array *)(self), count, array_elem_size(self)), \ + memset((self)->contents + (self)->size, 0, (count) * array_elem_size(self)), \ + (self)->size += (count)) + +/// Append all elements from one array to the end of another. +#define array_push_all(self, other) \ + array_extend((self), (other)->size, (other)->contents) + +/// Append `count` elements to the end of the array, reading their values from the +/// `contents` pointer. +#define array_extend(self, count, contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), (self)->size, \ + 0, count, contents \ + ) + +/// Remove `old_count` elements from the array starting at the given `index`. At +/// the same index, insert `new_count` new elements, reading their values from the +/// `new_contents` pointer. +#define array_splice(self, _index, old_count, new_count, new_contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), _index, \ + old_count, new_count, new_contents \ + ) + +/// Insert one `element` into the array at the given `index`. 
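+/// Elements at and after `index` are shifted one slot toward the end.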
+#define array_insert(self, _index, element) \ + _array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element)) + +/// Remove one element from the array at the given `index`. +#define array_erase(self, _index) \ + _array__erase((Array *)(self), array_elem_size(self), _index) + +/// Pop the last element off the array, returning the element by value. +#define array_pop(self) ((self)->contents[--(self)->size]) + +/// Assign the contents of one array to another, reallocating if necessary. +#define array_assign(self, other) \ + _array__assign((Array *)(self), (const Array *)(other), array_elem_size(self)) + +/// Swap one array with another +#define array_swap(self, other) \ + _array__swap((Array *)(self), (Array *)(other)) + +/// Get the size of the array contents +#define array_elem_size(self) (sizeof *(self)->contents) + +/// Search a sorted array for a given `needle` value, using the given `compare` +/// callback to determine the order. +/// +/// If an existing element is found to be equal to `needle`, then the `index` +/// out-parameter is set to the existing value's index, and the `exists` +/// out-parameter is set to true. Otherwise, `index` is set to an index where +/// `needle` should be inserted in order to preserve the sorting, and `exists` +/// is set to false. +#define array_search_sorted_with(self, compare, needle, _index, _exists) \ + _array__search_sorted(self, 0, compare, , needle, _index, _exists) + +/// Search a sorted array for a given `needle` value, using integer comparisons +/// of a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_with`. +#define array_search_sorted_by(self, field, needle, _index, _exists) \ + _array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists) + +/// Insert a given `value` into a sorted array, using the given `compare` +/// callback to determine the order. +#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_with(self, compare, &(value), &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +/// Insert a given `value` into a sorted array, using integer comparisons of +/// a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_by(self, field, (value) field, &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +// Private + +typedef Array(void) Array; + +/// This is not what you're looking for, see `array_delete`. +static inline void _array__delete(Array *self) { + if (self->contents) { + ts_free(self->contents); + self->contents = NULL; + self->size = 0; + self->capacity = 0; + } +} + +/// This is not what you're looking for, see `array_erase`. +static inline void _array__erase(Array *self, size_t element_size, + uint32_t index) { + assert(index < self->size); + char *contents = (char *)self->contents; + memmove(contents + index * element_size, contents + (index + 1) * element_size, + (self->size - index - 1) * element_size); + self->size--; +} + +/// This is not what you're looking for, see `array_reserve`. 
+static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) { + if (new_capacity > self->capacity) { + if (self->contents) { + self->contents = ts_realloc(self->contents, new_capacity * element_size); + } else { + self->contents = ts_malloc(new_capacity * element_size); + } + self->capacity = new_capacity; + } +} + +/// This is not what you're looking for, see `array_assign`. +static inline void _array__assign(Array *self, const Array *other, size_t element_size) { + _array__reserve(self, element_size, other->size); + self->size = other->size; + memcpy(self->contents, other->contents, self->size * element_size); +} + +/// This is not what you're looking for, see `array_swap`. +static inline void _array__swap(Array *self, Array *other) { + Array swap = *other; + *other = *self; + *self = swap; +} + +/// This is not what you're looking for, see `array_push` or `array_grow_by`. +static inline void _array__grow(Array *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; + if (new_size > self->capacity) { + uint32_t new_capacity = self->capacity * 2; + if (new_capacity < 8) new_capacity = 8; + if (new_capacity < new_size) new_capacity = new_size; + _array__reserve(self, element_size, new_capacity); + } +} + +/// This is not what you're looking for, see `array_splice`. +static inline void _array__splice(Array *self, size_t element_size, + uint32_t index, uint32_t old_count, + uint32_t new_count, const void *elements) { + uint32_t new_size = self->size + new_count - old_count; + uint32_t old_end = index + old_count; + uint32_t new_end = index + new_count; + assert(old_end <= self->size); + + _array__reserve(self, element_size, new_size); + + char *contents = (char *)self->contents; + if (self->size > old_end) { + memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } + self->size += new_count - old_count; +} + +/// A binary search routine, based on Rust's `std::slice::binary_search_by`. +/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`. +#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \ + do { \ + *(_index) = start; \ + *(_exists) = false; \ + uint32_t size = (self)->size - *(_index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(_index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(_index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \ + if (comparison == 0) *(_exists) = true; \ + else if (comparison < 0) *(_index) += 1; \ + } while (0) + +/// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +/// parameter by reference in order to work with the generic sorting function above. 
+#define _compare_int(a, b) ((int)*(a) - (int)(b)) + +#ifdef _MSC_VER +#pragma warning(default : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ARRAY_H_ diff --git a/test-grammars/markdown-inline/src/tree_sitter/parser.h b/test-grammars/markdown-inline/src/tree_sitter/parser.h new file mode 100644 index 0000000..17b4fde --- /dev/null +++ b/test-grammars/markdown-inline/src/tree_sitter/parser.h @@ -0,0 +1,230 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSStateId; +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; + const TSStateId *primary_state_ids; +}; + +/* + * Lexer Macros + */ + +#ifdef _MSC_VER +#define UNUSED __pragma(warning(suppress : 4101)) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define 
START_LEXER() \ + bool result = false; \ + bool skip = false; \ + UNUSED \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT) + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value) \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value), \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_val, child_count_val, ...) \ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_ diff --git a/test-grammars/markdown/LICENSE b/test-grammars/markdown/LICENSE new file mode 100644 index 0000000..c125939 --- /dev/null +++ b/test-grammars/markdown/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Matthias Deiml + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
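Aside for reviewers (not part of the vendored files): every hand-written `scanner.c` added in this diff follows the same fixed external-scanner ABI, five `tree_sitter_<grammar>_external_scanner_*` entry points that drive the `TSLexer` callbacks and hook into the `external_scanner` field of `TSLanguage` declared in the vendored `tree_sitter/parser.h`. The sketch below is a minimal, hypothetical scanner — `tree_sitter_example_*` and its single `NEWLINE` token are invented for illustration — not one of the grammars in this change.

```c
// Minimal, hypothetical external scanner showing the ABI that the vendored
// markdown and markdown-inline scanners implement.
#include <stdbool.h>
#include <stddef.h>

#include "tree_sitter/parser.h"

// Token IDs must match the order of the grammar's `externals` list.
enum TokenType { NEWLINE };

void *tree_sitter_example_external_scanner_create(void) {
    return NULL; // this toy scanner keeps no heap state
}

void tree_sitter_example_external_scanner_destroy(void *payload) {
    (void)payload;
}

unsigned tree_sitter_example_external_scanner_serialize(void *payload, char *buffer) {
    (void)payload;
    (void)buffer;
    return 0; // nothing to persist between parse states
}

void tree_sitter_example_external_scanner_deserialize(void *payload,
                                                      const char *buffer,
                                                      unsigned length) {
    (void)payload;
    (void)buffer;
    (void)length;
}

bool tree_sitter_example_external_scanner_scan(void *payload, TSLexer *lexer,
                                               const bool *valid_symbols) {
    (void)payload;
    // Emit NEWLINE only when the parser can currently accept it.
    if (valid_symbols[NEWLINE] && lexer->lookahead == '\n') {
        lexer->advance(lexer, false); // consume the character
        lexer->mark_end(lexer);       // the token ends here
        lexer->result_symbol = NEWLINE;
        return true;
    }
    return false;
}
```

Stateful scanners, like the markdown one later in this diff, keep their data behind the `payload` pointer and round-trip it through `serialize`/`deserialize` (writing at most `TREE_SITTER_SERIALIZATION_BUFFER_SIZE` bytes), which is what lets incremental reparses resume at any block nesting depth.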
diff --git a/test-grammars/markdown/highlights.scm b/test-grammars/markdown/highlights.scm new file mode 100644 index 0000000..a80fc1b --- /dev/null +++ b/test-grammars/markdown/highlights.scm @@ -0,0 +1,62 @@ + +(setext_heading (paragraph) @markup.heading.1 (setext_h1_underline) @markup.heading.marker) +(setext_heading (paragraph) @markup.heading.2 (setext_h2_underline) @markup.heading.marker) + +(atx_heading (atx_h1_marker) @markup.heading.marker) @markup.heading.1 +(atx_heading (atx_h2_marker) @markup.heading.marker) @markup.heading.2 +(atx_heading (atx_h3_marker) @markup.heading.marker) @markup.heading.3 +(atx_heading (atx_h4_marker) @markup.heading.marker) @markup.heading.4 +(atx_heading (atx_h5_marker) @markup.heading.marker) @markup.heading.5 +(atx_heading (atx_h6_marker) @markup.heading.marker) @markup.heading.6 + +[ + (indented_code_block) + (fenced_code_block) +] @markup.raw.block + +(info_string) @label + +[ + (fenced_code_block_delimiter) +] @punctuation.bracket + +[ + (link_destination) +] @markup.link.url + +[ + (link_label) +] @markup.link.label + +[ + (list_marker_plus) + (list_marker_minus) + (list_marker_star) +] @markup.list.unnumbered + +[ + (list_marker_dot) + (list_marker_parenthesis) +] @markup.list.numbered + +(task_list_marker_checked) @markup.list.checked +(task_list_marker_unchecked) @markup.list.unchecked + +(thematic_break) @punctuation.special + +[ + (block_continuation) + (block_quote_marker) +] @punctuation.special + +[ + (backslash_escape) +] @string.escape + +(block_quote) @markup.quote + +(pipe_table_row + "|" @punctuation.special) +(pipe_table_header + "|" @punctuation.special) +(pipe_table_delimiter_row) @punctuation.special diff --git a/test-grammars/markdown/injections.scm b/test-grammars/markdown/injections.scm new file mode 100644 index 0000000..7ed09c6 --- /dev/null +++ b/test-grammars/markdown/injections.scm @@ -0,0 +1,22 @@ +; From nvim-treesitter/nvim-treesitter + +(fenced_code_block + (code_fence_content) @injection.shebang @injection.content + (#set! injection.include-unnamed-children)) + +(fenced_code_block + (info_string + (language) @injection.language) + (code_fence_content) @injection.content (#set! injection.include-unnamed-children)) + +((html_block) @injection.content + (#set! injection.language "html") + (#set! injection.include-unnamed-children) + (#set! injection.combined)) + +((pipe_table_cell) @injection.content (#set! injection.language "markdown-inline") (#set! injection.include-unnamed-children)) + +((minus_metadata) @injection.content (#set! injection.language "yaml") (#set! injection.include-unnamed-children)) +((plus_metadata) @injection.content (#set! injection.language "toml") (#set! injection.include-unnamed-children)) + +((inline) @injection.content (#set! injection.language "markdown-inline") (#set! 
injection.include-unnamed-children)) diff --git a/test-grammars/markdown/metadata.json b/test-grammars/markdown/metadata.json new file mode 100644 index 0000000..44667f5 --- /dev/null +++ b/test-grammars/markdown/metadata.json @@ -0,0 +1,6 @@ +{ + "repo": "https://github.com/tree-sitter-grammars/tree-sitter-markdown", + "rev": "62516e8c78380e3b51d5b55727995d2c511436d8", + "license": "MIT", + "compressed": true +} \ No newline at end of file diff --git a/test-grammars/markdown/src/grammar.json b/test-grammars/markdown/src/grammar.json new file mode 100644 index 0000000..963b8f9 Binary files /dev/null and b/test-grammars/markdown/src/grammar.json differ diff --git a/test-grammars/markdown/src/parser.c b/test-grammars/markdown/src/parser.c new file mode 100644 index 0000000..df56218 Binary files /dev/null and b/test-grammars/markdown/src/parser.c differ diff --git a/test-grammars/markdown/src/scanner.c b/test-grammars/markdown/src/scanner.c new file mode 100644 index 0000000..969e806 --- /dev/null +++ b/test-grammars/markdown/src/scanner.c @@ -0,0 +1,1597 @@ +#include "tree_sitter/parser.h" +#include +#include +#include +#include +#include + +// For explanation of the tokens see grammar.js +typedef enum { + LINE_ENDING, + SOFT_LINE_ENDING, + BLOCK_CLOSE, + BLOCK_CONTINUATION, + BLOCK_QUOTE_START, + INDENTED_CHUNK_START, + ATX_H1_MARKER, + ATX_H2_MARKER, + ATX_H3_MARKER, + ATX_H4_MARKER, + ATX_H5_MARKER, + ATX_H6_MARKER, + SETEXT_H1_UNDERLINE, + SETEXT_H2_UNDERLINE, + THEMATIC_BREAK, + LIST_MARKER_MINUS, + LIST_MARKER_PLUS, + LIST_MARKER_STAR, + LIST_MARKER_PARENTHESIS, + LIST_MARKER_DOT, + LIST_MARKER_MINUS_DONT_INTERRUPT, + LIST_MARKER_PLUS_DONT_INTERRUPT, + LIST_MARKER_STAR_DONT_INTERRUPT, + LIST_MARKER_PARENTHESIS_DONT_INTERRUPT, + LIST_MARKER_DOT_DONT_INTERRUPT, + FENCED_CODE_BLOCK_START_BACKTICK, + FENCED_CODE_BLOCK_START_TILDE, + BLANK_LINE_START, + FENCED_CODE_BLOCK_END_BACKTICK, + FENCED_CODE_BLOCK_END_TILDE, + HTML_BLOCK_1_START, + HTML_BLOCK_1_END, + HTML_BLOCK_2_START, + HTML_BLOCK_3_START, + HTML_BLOCK_4_START, + HTML_BLOCK_5_START, + HTML_BLOCK_6_START, + HTML_BLOCK_7_START, + CLOSE_BLOCK, + NO_INDENTED_CHUNK, + ERROR, + TRIGGER_ERROR, + TOKEN_EOF, + MINUS_METADATA, + PLUS_METADATA, + PIPE_TABLE_START, + PIPE_TABLE_LINE_ENDING, +} TokenType; + +// Description of a block on the block stack. +// +// LIST_ITEM is a list item with minimal indentation (content begins at indent +// level 2) while LIST_ITEM_MAX_INDENTATION represents a list item with maximal +// indentation without being considered a indented code block. +// +// ANONYMOUS represents any block that whose close is not handled by the +// external s. +typedef enum { + BLOCK_QUOTE, + INDENTED_CODE_BLOCK, + LIST_ITEM, + LIST_ITEM_1_INDENTATION, + LIST_ITEM_2_INDENTATION, + LIST_ITEM_3_INDENTATION, + LIST_ITEM_4_INDENTATION, + LIST_ITEM_5_INDENTATION, + LIST_ITEM_6_INDENTATION, + LIST_ITEM_7_INDENTATION, + LIST_ITEM_8_INDENTATION, + LIST_ITEM_9_INDENTATION, + LIST_ITEM_10_INDENTATION, + LIST_ITEM_11_INDENTATION, + LIST_ITEM_12_INDENTATION, + LIST_ITEM_13_INDENTATION, + LIST_ITEM_14_INDENTATION, + LIST_ITEM_MAX_INDENTATION, + FENCED_CODE_BLOCK, + ANONYMOUS, +} Block; + +// Determines if a character is punctuation as defined by the markdown spec. +static bool is_punctuation(char chr) { + return (chr >= '!' && chr <= '/') || (chr >= ':' && chr <= '@') || + (chr >= '[' && chr <= '`') || (chr >= '{' && chr <= '~'); +} + +// Returns the indentation level which lines of a list item should have at +// minimum. 
Should only be called with blocks for which `is_list_item` returns +// true. +static uint8_t list_item_indentation(Block block) { + return (uint8_t)(block - LIST_ITEM + 2); +} + +#define NUM_HTML_TAG_NAMES_RULE_1 3 + +static const char *const HTML_TAG_NAMES_RULE_1[NUM_HTML_TAG_NAMES_RULE_1] = { + "pre", "script", "style"}; + +#define NUM_HTML_TAG_NAMES_RULE_7 62 + +static const char *const HTML_TAG_NAMES_RULE_7[NUM_HTML_TAG_NAMES_RULE_7] = { + "address", "article", "aside", "base", "basefont", "blockquote", + "body", "caption", "center", "col", "colgroup", "dd", + "details", "dialog", "dir", "div", "dl", "dt", + "fieldset", "figcaption", "figure", "footer", "form", "frame", + "frameset", "h1", "h2", "h3", "h4", "h5", + "h6", "head", "header", "hr", "html", "iframe", + "legend", "li", "link", "main", "menu", "menuitem", + "nav", "noframes", "ol", "optgroup", "option", "p", + "param", "section", "source", "summary", "table", "tbody", + "td", "tfoot", "th", "thead", "title", "tr", + "track", "ul"}; + +// For explanation of the tokens see grammar.js +static const bool paragraph_interrupt_symbols[] = { + false, // LINE_ENDING, + false, // SOFT_LINE_ENDING, + false, // BLOCK_CLOSE, + false, // BLOCK_CONTINUATION, + true, // BLOCK_QUOTE_START, + false, // INDENTED_CHUNK_START, + true, // ATX_H1_MARKER, + true, // ATX_H2_MARKER, + true, // ATX_H3_MARKER, + true, // ATX_H4_MARKER, + true, // ATX_H5_MARKER, + true, // ATX_H6_MARKER, + true, // SETEXT_H1_UNDERLINE, + true, // SETEXT_H2_UNDERLINE, + true, // THEMATIC_BREAK, + true, // LIST_MARKER_MINUS, + true, // LIST_MARKER_PLUS, + true, // LIST_MARKER_STAR, + true, // LIST_MARKER_PARENTHESIS, + true, // LIST_MARKER_DOT, + false, // LIST_MARKER_MINUS_DONT_INTERRUPT, + false, // LIST_MARKER_PLUS_DONT_INTERRUPT, + false, // LIST_MARKER_STAR_DONT_INTERRUPT, + false, // LIST_MARKER_PARENTHESIS_DONT_INTERRUPT, + false, // LIST_MARKER_DOT_DONT_INTERRUPT, + true, // FENCED_CODE_BLOCK_START_BACKTICK, + true, // FENCED_CODE_BLOCK_START_TILDE, + true, // BLANK_LINE_START, + false, // FENCED_CODE_BLOCK_END_BACKTICK, + false, // FENCED_CODE_BLOCK_END_TILDE, + true, // HTML_BLOCK_1_START, + false, // HTML_BLOCK_1_END, + true, // HTML_BLOCK_2_START, + true, // HTML_BLOCK_3_START, + true, // HTML_BLOCK_4_START, + true, // HTML_BLOCK_5_START, + true, // HTML_BLOCK_6_START, + false, // HTML_BLOCK_7_START, + false, // CLOSE_BLOCK, + false, // NO_INDENTED_CHUNK, + false, // ERROR, + false, // TRIGGER_ERROR, + false, // EOF, + false, // MINUS_METADATA, + false, // PLUS_METADATA, + true, // PIPE_TABLE_START, + false, // PIPE_TABLE_LINE_ENDING, +}; + +// State bitflags used with `Scanner.state` + +// Currently matching (at the beginning of a line) +static const uint8_t STATE_MATCHING = 0x1 << 0; +// Last line break was inside a paragraph +static const uint8_t STATE_WAS_SOFT_LINE_BREAK = 0x1 << 1; +// Block should be closed after next line break +static const uint8_t STATE_CLOSE_BLOCK = 0x1 << 4; + +static size_t roundup_32(size_t x) { + x--; + + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + x++; + + return x; +} + +typedef struct { + // A stack of open blocks in the current parse state + struct { + size_t size; + size_t capacity; + Block *items; + } open_blocks; + + // Parser state flags + uint8_t state; + // Number of blocks that have been matched so far. Only changes during + // matching and is reset after every line ending + uint8_t matched; + // Consumed but "unused" indentation. 
Sometimes a tab needs to be "split" to + // be used in multiple tokens. + uint8_t indentation; + // The current column. Used to decide how many spaces a tab should equal + uint8_t column; + // The delimiter length of the currently open fenced code block + uint8_t fenced_code_block_delimiter_length; + + bool simulate; +} Scanner; + +static void push_block(Scanner *s, Block b) { + if (s->open_blocks.size == s->open_blocks.capacity) { + s->open_blocks.capacity = + s->open_blocks.capacity ? s->open_blocks.capacity << 1 : 8; + void *tmp = realloc(s->open_blocks.items, + sizeof(Block) * s->open_blocks.capacity); + assert(tmp != NULL); + s->open_blocks.items = tmp; + } + + s->open_blocks.items[s->open_blocks.size++] = b; +} + +static inline Block pop_block(Scanner *s) { + return s->open_blocks.items[--s->open_blocks.size]; +} + +// Write the whole state of a Scanner to a byte buffer +static unsigned serialize(Scanner *s, char *buffer) { + unsigned size = 0; + buffer[size++] = (char)s->state; + buffer[size++] = (char)s->matched; + buffer[size++] = (char)s->indentation; + buffer[size++] = (char)s->column; + buffer[size++] = (char)s->fenced_code_block_delimiter_length; + size_t blocks_count = s->open_blocks.size; + if (blocks_count > 0) { + memcpy(&buffer[size], s->open_blocks.items, + blocks_count * sizeof(Block)); + size += blocks_count * sizeof(Block); + } + return size; +} + +// Read the whole state of a Scanner from a byte buffer +// `serizalize` and `deserialize` should be fully symmetric. +static void deserialize(Scanner *s, const char *buffer, unsigned length) { + s->open_blocks.size = 0; + s->open_blocks.capacity = 0; + s->state = 0; + s->matched = 0; + s->indentation = 0; + s->column = 0; + s->fenced_code_block_delimiter_length = 0; + if (length > 0) { + size_t size = 0; + s->state = (uint8_t)buffer[size++]; + s->matched = (uint8_t)buffer[size++]; + s->indentation = (uint8_t)buffer[size++]; + s->column = (uint8_t)buffer[size++]; + s->fenced_code_block_delimiter_length = (uint8_t)buffer[size++]; + size_t blocks_size = length - size; + if (blocks_size > 0) { + size_t blocks_count = blocks_size / sizeof(Block); + + // ensure open blocks has enough room + if (s->open_blocks.capacity < blocks_count) { + size_t capacity = roundup_32(blocks_count); + void *tmp = realloc(s->open_blocks.items, + sizeof(Block) * capacity); + assert(tmp != NULL); + s->open_blocks.items = tmp; + s->open_blocks.capacity = capacity; + } + memcpy(s->open_blocks.items, &buffer[size], blocks_size); + s->open_blocks.size = blocks_count; + } + } +} + +static void mark_end(Scanner *s, TSLexer *lexer) { + if (!s->simulate) { + lexer->mark_end(lexer); + } +} + +// Convenience function to emit the error token. This is done to stop invalid +// parse branches. Specifically: +// 1. When encountering a newline after a line break that ended a paragraph, and +// no new block +// has been opened. +// 2. When encountering a new block after a soft line break. +// 3. When a `$._trigger_error` token is valid, which is used to stop parse +// branches through +// normal tree-sitter grammar rules. 
+// +// See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in +// grammar.js +static bool error(TSLexer *lexer) { + lexer->result_symbol = ERROR; + return true; +} + +// Advance the lexer one character +// Also keeps track of the current column, counting tabs as spaces with tab stop +// 4 See https://github.github.com/gfm/#tabs +static size_t advance(Scanner *s, TSLexer *lexer) { + size_t size = 1; + if (lexer->lookahead == '\t') { + size = 4 - s->column; + s->column = 0; + } else { + s->column = (s->column + 1) % 4; + } + lexer->advance(lexer, false); + return size; +} + +// Try to match the given block, i.e. consume all tokens that belong to the +// block. These are +// 1. indentation for list items and indented code blocks +// 2. '>' for block quotes +// Returns true if the block is matched and false otherwise +static bool match(Scanner *s, TSLexer *lexer, Block block) { + switch (block) { + case INDENTED_CODE_BLOCK: + while (s->indentation < 4) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } else { + break; + } + } + if (s->indentation >= 4 && lexer->lookahead != '\n' && + lexer->lookahead != '\r') { + s->indentation -= 4; + return true; + } + break; + case LIST_ITEM: + case LIST_ITEM_1_INDENTATION: + case LIST_ITEM_2_INDENTATION: + case LIST_ITEM_3_INDENTATION: + case LIST_ITEM_4_INDENTATION: + case LIST_ITEM_5_INDENTATION: + case LIST_ITEM_6_INDENTATION: + case LIST_ITEM_7_INDENTATION: + case LIST_ITEM_8_INDENTATION: + case LIST_ITEM_9_INDENTATION: + case LIST_ITEM_10_INDENTATION: + case LIST_ITEM_11_INDENTATION: + case LIST_ITEM_12_INDENTATION: + case LIST_ITEM_13_INDENTATION: + case LIST_ITEM_14_INDENTATION: + case LIST_ITEM_MAX_INDENTATION: + while (s->indentation < list_item_indentation(block)) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } else { + break; + } + } + if (s->indentation >= list_item_indentation(block)) { + s->indentation -= list_item_indentation(block); + return true; + } + if (lexer->lookahead == '\n' || lexer->lookahead == '\r') { + s->indentation = 0; + return true; + } + break; + case BLOCK_QUOTE: + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } + if (lexer->lookahead == '>') { + advance(s, lexer); + s->indentation = 0; + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer) - 1; + } + return true; + } + break; + case FENCED_CODE_BLOCK: + case ANONYMOUS: + return true; + } + return false; +} + +static bool parse_fenced_code_block(Scanner *s, const char delimiter, + TSLexer *lexer, const bool *valid_symbols) { + // count the number of backticks + uint8_t level = 0; + while (lexer->lookahead == delimiter) { + advance(s, lexer); + level++; + } + mark_end(s, lexer); + // If this is able to close a fenced code block then that is the only valid + // interpretation. It can only close a fenced code block if the number of + // backticks is at least the number of backticks of the opening delimiter. + // Also it cannot be indented more than 3 spaces. + if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_END_BACKTICK] + : valid_symbols[FENCED_CODE_BLOCK_END_TILDE]) && + s->indentation < 4 && level >= s->fenced_code_block_delimiter_length && + (lexer->lookahead == '\n' || lexer->lookahead == '\r')) { + s->fenced_code_block_delimiter_length = 0; + lexer->result_symbol = delimiter == '`' ? 
FENCED_CODE_BLOCK_END_BACKTICK + : FENCED_CODE_BLOCK_END_TILDE; + return true; + } + // If this could be the start of a fenced code block, check if the info + // string contains any backticks. + if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_START_BACKTICK] + : valid_symbols[FENCED_CODE_BLOCK_START_TILDE]) && + level >= 3) { + bool info_string_has_backtick = false; + if (delimiter == '`') { + while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && + !lexer->eof(lexer)) { + if (lexer->lookahead == '`') { + info_string_has_backtick = true; + break; + } + advance(s, lexer); + } + } + // If it does not then choose to interpret this as the start of a fenced + // code block. + if (!info_string_has_backtick) { + lexer->result_symbol = delimiter == '`' + ? FENCED_CODE_BLOCK_START_BACKTICK + : FENCED_CODE_BLOCK_START_TILDE; + if (!s->simulate) + push_block(s, FENCED_CODE_BLOCK); + // Remember the length of the delimiter for later, since we need it + // to decide whether a sequence of backticks can close the block. + s->fenced_code_block_delimiter_length = level; + s->indentation = 0; + return true; + } + } + return false; +} + +static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + advance(s, lexer); + mark_end(s, lexer); + // Otherwise count the number of stars permitting whitespaces between them. + size_t star_count = 1; + // Also remember how many stars there are before the first whitespace... + // ...and how many spaces follow the first star. + uint8_t extra_indentation = 0; + for (;;) { + if (lexer->lookahead == '*') { + if (star_count == 1 && extra_indentation >= 1 && + valid_symbols[LIST_MARKER_STAR]) { + // If we get to this point then the token has to be at least + // this long. We need to call `mark_end` here in case we decide + // later that this is a list item. + mark_end(s, lexer); + } + star_count++; + advance(s, lexer); + } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + if (star_count == 1) { + extra_indentation += advance(s, lexer); + } else { + advance(s, lexer); + } + } else { + break; + } + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; + bool dont_interrupt = false; + if (star_count == 1 && line_end) { + extra_indentation = 1; + // line is empty so don't interrupt paragraphs if this is a list marker + dont_interrupt = s->matched == s->open_blocks.size; + } + // If there were at least 3 stars then this could be a thematic break + bool thematic_break = star_count >= 3 && line_end; + // If there was a star and at least one space after that star then this + // could be a list marker. + bool list_marker_star = star_count >= 1 && extra_indentation >= 1; + if (valid_symbols[THEMATIC_BREAK] && thematic_break && s->indentation < 4) { + // If a thematic break is valid then it takes precedence + lexer->result_symbol = THEMATIC_BREAK; + mark_end(s, lexer); + s->indentation = 0; + return true; + } + if ((dont_interrupt ? valid_symbols[LIST_MARKER_STAR_DONT_INTERRUPT] + : valid_symbols[LIST_MARKER_STAR]) && + list_marker_star) { + // List markers take precedence over emphasis markers + // If star_count > 1 then we already called mark_end at the right point. + // Otherwise the token should go until this point. + if (star_count == 1) { + mark_end(s, lexer); + } + // Not counting one space... + extra_indentation--; + // ... 
check if the list item begins with an indented code block + if (extra_indentation <= 3) { + // If not then calculate the indentation level of the list item + // content as indentation of list marker + indentation after list + // marker - 1 + extra_indentation += s->indentation; + s->indentation = 0; + } else { + // Otherwise the indentation level is just the indentation of the + // list marker. We keep the indentation after the list marker for + // later blocks. + uint8_t temp = s->indentation; + s->indentation = extra_indentation; + extra_indentation = temp; + } + if (!s->simulate) + push_block(s, (Block)(LIST_ITEM + extra_indentation)); + lexer->result_symbol = + dont_interrupt ? LIST_MARKER_STAR_DONT_INTERRUPT : LIST_MARKER_STAR; + return true; + } + return false; +} + +static bool parse_thematic_break_underscore(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + advance(s, lexer); + mark_end(s, lexer); + size_t underscore_count = 1; + for (;;) { + if (lexer->lookahead == '_') { + underscore_count++; + advance(s, lexer); + } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } else { + break; + } + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; + if (underscore_count >= 3 && line_end && valid_symbols[THEMATIC_BREAK]) { + lexer->result_symbol = THEMATIC_BREAK; + mark_end(s, lexer); + s->indentation = 0; + return true; + } + return false; +} + +static bool parse_block_quote(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + if (valid_symbols[BLOCK_QUOTE_START]) { + advance(s, lexer); + s->indentation = 0; + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer) - 1; + } + lexer->result_symbol = BLOCK_QUOTE_START; + if (!s->simulate) + push_block(s, BLOCK_QUOTE); + return true; + } + return false; +} + +static bool parse_atx_heading(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + if (valid_symbols[ATX_H1_MARKER] && s->indentation <= 3) { + mark_end(s, lexer); + uint16_t level = 0; + while (lexer->lookahead == '#' && level <= 6) { + advance(s, lexer); + level++; + } + if (level <= 6 && + (lexer->lookahead == ' ' || lexer->lookahead == '\t' || + lexer->lookahead == '\n' || lexer->lookahead == '\r')) { + lexer->result_symbol = ATX_H1_MARKER + (level - 1); + s->indentation = 0; + mark_end(s, lexer); + return true; + } + } + return false; +} + +static bool parse_setext_underline(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + if (valid_symbols[SETEXT_H1_UNDERLINE] && + s->matched == s->open_blocks.size) { + mark_end(s, lexer); + while (lexer->lookahead == '=') { + advance(s, lexer); + } + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '\n' || lexer->lookahead == '\r') { + lexer->result_symbol = SETEXT_H1_UNDERLINE; + mark_end(s, lexer); + return true; + } + } + return false; +} + +static bool parse_plus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + if (s->indentation <= 3 && + (valid_symbols[LIST_MARKER_PLUS] || + valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] || + valid_symbols[PLUS_METADATA])) { + advance(s, lexer); + if (valid_symbols[PLUS_METADATA] && lexer->lookahead == '+') { + advance(s, lexer); + if (lexer->lookahead != '+') { + return false; + } + advance(s, lexer); + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead != '\n' && lexer->lookahead != '\r') { + return false; + } + for (;;) { + // advance 
over newline + if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + // check for pluses + size_t plus_count = 0; + while (lexer->lookahead == '+') { + plus_count++; + advance(s, lexer); + } + if (plus_count == 3) { + // if exactly 3 check if next symbol (after eventual + // whitespace) is newline + while (lexer->lookahead == ' ' || + lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { + // if so also consume newline + if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + mark_end(s, lexer); + lexer->result_symbol = PLUS_METADATA; + return true; + } + } + // otherwise consume rest of line + while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && + !lexer->eof(lexer)) { + advance(s, lexer); + } + // if end of file is reached, then this is not metadata + if (lexer->eof(lexer)) { + break; + } + } + } else { + uint8_t extra_indentation = 0; + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + extra_indentation += advance(s, lexer); + } + bool dont_interrupt = false; + if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { + extra_indentation = 1; + dont_interrupt = true; + } + dont_interrupt = + dont_interrupt && s->matched == s->open_blocks.size; + if (extra_indentation >= 1 && + (dont_interrupt ? valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] + : valid_symbols[LIST_MARKER_PLUS])) { + lexer->result_symbol = dont_interrupt + ? LIST_MARKER_PLUS_DONT_INTERRUPT + : LIST_MARKER_PLUS; + extra_indentation--; + if (extra_indentation <= 3) { + extra_indentation += s->indentation; + s->indentation = 0; + } else { + uint8_t temp = s->indentation; + s->indentation = extra_indentation; + extra_indentation = temp; + } + if (!s->simulate) + push_block(s, (Block)(LIST_ITEM + extra_indentation)); + return true; + } + } + } + return false; +} + +static bool parse_ordered_list_marker(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + if (s->indentation <= 3 && + (valid_symbols[LIST_MARKER_PARENTHESIS] || + valid_symbols[LIST_MARKER_DOT] || + valid_symbols[LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] || + valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT])) { + size_t digits = 1; + bool dont_interrupt = lexer->lookahead != '1'; + advance(s, lexer); + while (isdigit(lexer->lookahead)) { + dont_interrupt = true; + digits++; + advance(s, lexer); + } + if (digits >= 1 && digits <= 9) { + bool dot = false; + bool parenthesis = false; + if (lexer->lookahead == '.') { + advance(s, lexer); + dot = true; + } else if (lexer->lookahead == ')') { + advance(s, lexer); + parenthesis = true; + } + if (dot || parenthesis) { + uint8_t extra_indentation = 0; + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + extra_indentation += advance(s, lexer); + } + bool line_end = + lexer->lookahead == '\n' || lexer->lookahead == '\r'; + if (line_end) { + extra_indentation = 1; + dont_interrupt = true; + } + dont_interrupt = + dont_interrupt && s->matched == s->open_blocks.size; + if (extra_indentation >= 1 && + (dot ? (dont_interrupt + ? valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT] + : valid_symbols[LIST_MARKER_DOT]) + : (dont_interrupt + ? valid_symbols + [LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] + : valid_symbols[LIST_MARKER_PARENTHESIS]))) { + lexer->result_symbol = + dot ? 
LIST_MARKER_DOT : LIST_MARKER_PARENTHESIS; + extra_indentation--; + if (extra_indentation <= 3) { + extra_indentation += s->indentation; + s->indentation = 0; + } else { + uint8_t temp = s->indentation; + s->indentation = extra_indentation; + extra_indentation = temp; + } + if (!s->simulate) + push_block( + s, (Block)(LIST_ITEM + extra_indentation + digits)); + return true; + } + } + } + } + return false; +} + +static bool parse_minus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + if (s->indentation <= 3 && + (valid_symbols[LIST_MARKER_MINUS] || + valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] || + valid_symbols[SETEXT_H2_UNDERLINE] || valid_symbols[THEMATIC_BREAK] || + valid_symbols[MINUS_METADATA])) { + mark_end(s, lexer); + bool whitespace_after_minus = false; + bool minus_after_whitespace = false; + size_t minus_count = 0; + uint8_t extra_indentation = 0; + + for (;;) { + if (lexer->lookahead == '-') { + if (minus_count == 1 && extra_indentation >= 1) { + mark_end(s, lexer); + } + minus_count++; + advance(s, lexer); + minus_after_whitespace = whitespace_after_minus; + } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + if (minus_count == 1) { + extra_indentation += advance(s, lexer); + } else { + advance(s, lexer); + } + whitespace_after_minus = true; + } else { + break; + } + } + bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r'; + bool dont_interrupt = false; + if (minus_count == 1 && line_end) { + extra_indentation = 1; + dont_interrupt = true; + } + dont_interrupt = dont_interrupt && s->matched == s->open_blocks.size; + bool thematic_break = minus_count >= 3 && line_end; + bool underline = + minus_count >= 1 && !minus_after_whitespace && line_end && + s->matched == + s->open_blocks + .size; // setext heading can not break lazy continuation + bool list_marker_minus = minus_count >= 1 && extra_indentation >= 1; + bool success = false; + if (valid_symbols[SETEXT_H2_UNDERLINE] && underline) { + lexer->result_symbol = SETEXT_H2_UNDERLINE; + mark_end(s, lexer); + s->indentation = 0; + success = true; + } else if (valid_symbols[THEMATIC_BREAK] && + thematic_break) { // underline is false if list_marker_minus + // is true + lexer->result_symbol = THEMATIC_BREAK; + mark_end(s, lexer); + s->indentation = 0; + success = true; + } else if ((dont_interrupt + ? valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] + : valid_symbols[LIST_MARKER_MINUS]) && + list_marker_minus) { + if (minus_count == 1) { + mark_end(s, lexer); + } + extra_indentation--; + if (extra_indentation <= 3) { + extra_indentation += s->indentation; + s->indentation = 0; + } else { + uint8_t temp = s->indentation; + s->indentation = extra_indentation; + extra_indentation = temp; + } + if (!s->simulate) + push_block(s, (Block)(LIST_ITEM + extra_indentation)); + lexer->result_symbol = dont_interrupt + ? 
LIST_MARKER_MINUS_DONT_INTERRUPT + : LIST_MARKER_MINUS; + return true; + } + if (minus_count == 3 && (!minus_after_whitespace) && line_end && + valid_symbols[MINUS_METADATA]) { + for (;;) { + // advance over newline + if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + // check for minuses + minus_count = 0; + while (lexer->lookahead == '-') { + minus_count++; + advance(s, lexer); + } + if (minus_count == 3) { + // if exactly 3 check if next symbol (after eventual + // whitespace) is newline + while (lexer->lookahead == ' ' || + lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { + // if so also consume newline + if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + mark_end(s, lexer); + lexer->result_symbol = MINUS_METADATA; + return true; + } + } + // otherwise consume rest of line + while (lexer->lookahead != '\n' && lexer->lookahead != '\r' && + !lexer->eof(lexer)) { + advance(s, lexer); + } + // if end of file is reached, then this is not metadata + if (lexer->eof(lexer)) { + break; + } + } + } + if (success) { + return true; + } + } + return false; +} + +static bool parse_html_block(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + if (!(valid_symbols[HTML_BLOCK_1_START] || + valid_symbols[HTML_BLOCK_1_END] || + valid_symbols[HTML_BLOCK_2_START] || + valid_symbols[HTML_BLOCK_3_START] || + valid_symbols[HTML_BLOCK_4_START] || + valid_symbols[HTML_BLOCK_5_START] || + valid_symbols[HTML_BLOCK_6_START] || + valid_symbols[HTML_BLOCK_7_START])) { + return false; + } + advance(s, lexer); + if (lexer->lookahead == '?' 
&& valid_symbols[HTML_BLOCK_3_START]) { + advance(s, lexer); + lexer->result_symbol = HTML_BLOCK_3_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + if (lexer->lookahead == '!') { + // could be block 2 + advance(s, lexer); + if (lexer->lookahead == '-') { + advance(s, lexer); + if (lexer->lookahead == '-' && valid_symbols[HTML_BLOCK_2_START]) { + advance(s, lexer); + lexer->result_symbol = HTML_BLOCK_2_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z' && + valid_symbols[HTML_BLOCK_4_START]) { + advance(s, lexer); + lexer->result_symbol = HTML_BLOCK_4_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } else if (lexer->lookahead == '[') { + advance(s, lexer); + if (lexer->lookahead == 'C') { + advance(s, lexer); + if (lexer->lookahead == 'D') { + advance(s, lexer); + if (lexer->lookahead == 'A') { + advance(s, lexer); + if (lexer->lookahead == 'T') { + advance(s, lexer); + if (lexer->lookahead == 'A') { + advance(s, lexer); + if (lexer->lookahead == '[' && + valid_symbols[HTML_BLOCK_5_START]) { + advance(s, lexer); + lexer->result_symbol = HTML_BLOCK_5_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + } + } + } + } + } + } + } + bool starting_slash = lexer->lookahead == '/'; + if (starting_slash) { + advance(s, lexer); + } + char name[11]; + size_t name_length = 0; + while (iswalpha((wint_t)lexer->lookahead)) { + if (name_length < 10) { + name[name_length++] = (char)towlower((wint_t)lexer->lookahead); + } else { + name_length = 12; + } + advance(s, lexer); + } + if (name_length == 0) { + return false; + } + bool tag_closed = false; + if (name_length < 11) { + name[name_length] = 0; + bool next_symbol_valid = + lexer->lookahead == ' ' || lexer->lookahead == '\t' || + lexer->lookahead == '\n' || lexer->lookahead == '\r' || + lexer->lookahead == '>'; + if (next_symbol_valid) { + // try block 1 names + for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_1; i++) { + if (strcmp(name, HTML_TAG_NAMES_RULE_1[i]) == 0) { + if (starting_slash) { + if (valid_symbols[HTML_BLOCK_1_END]) { + lexer->result_symbol = HTML_BLOCK_1_END; + return true; + } + } else if (valid_symbols[HTML_BLOCK_1_START]) { + lexer->result_symbol = HTML_BLOCK_1_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + } + } + } + if (!next_symbol_valid && lexer->lookahead == '/') { + advance(s, lexer); + if (lexer->lookahead == '>') { + advance(s, lexer); + tag_closed = true; + } + } + if (next_symbol_valid || tag_closed) { + // try block 2 names + for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_7; i++) { + if (strcmp(name, HTML_TAG_NAMES_RULE_7[i]) == 0 && + valid_symbols[HTML_BLOCK_6_START]) { + lexer->result_symbol = HTML_BLOCK_6_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + } + } + } + + if (!valid_symbols[HTML_BLOCK_7_START]) { + return false; + } + + if (!tag_closed) { + // tag name (continued) + while (iswalnum((wint_t)lexer->lookahead) || lexer->lookahead == '-') { + advance(s, lexer); + } + if (!starting_slash) { + // attributes + bool had_whitespace = false; + for (;;) { + // whitespace + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + had_whitespace = true; + advance(s, lexer); + } + if (lexer->lookahead == '/') { + advance(s, lexer); + break; + } + if (lexer->lookahead == '>') { + break; + } + // attribute name + if (!had_whitespace) { + return false; + } + if (!iswalpha((wint_t)lexer->lookahead) && + 
lexer->lookahead != '_' && lexer->lookahead != ':') { + return false; + } + had_whitespace = false; + advance(s, lexer); + while (iswalnum((wint_t)lexer->lookahead) || + lexer->lookahead == '_' || lexer->lookahead == '.' || + lexer->lookahead == ':' || lexer->lookahead == '-') { + advance(s, lexer); + } + // attribute value specification + // optional whitespace + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + had_whitespace = true; + advance(s, lexer); + } + // = + if (lexer->lookahead == '=') { + advance(s, lexer); + had_whitespace = false; + // optional whitespace + while (lexer->lookahead == ' ' || + lexer->lookahead == '\t') { + advance(s, lexer); + } + // attribute value + if (lexer->lookahead == '\'' || lexer->lookahead == '"') { + char delimiter = (char)lexer->lookahead; + advance(s, lexer); + while (lexer->lookahead != delimiter && + lexer->lookahead != '\n' && + lexer->lookahead != '\r' && !lexer->eof(lexer)) { + advance(s, lexer); + } + if (lexer->lookahead != delimiter) { + return false; + } + advance(s, lexer); + } else { + // unquoted attribute value + bool had_one = false; + while (lexer->lookahead != ' ' && + lexer->lookahead != '\t' && + lexer->lookahead != '"' && + lexer->lookahead != '\'' && + lexer->lookahead != '=' && + lexer->lookahead != '<' && + lexer->lookahead != '>' && + lexer->lookahead != '`' && + lexer->lookahead != '\n' && + lexer->lookahead != '\r' && !lexer->eof(lexer)) { + advance(s, lexer); + had_one = true; + } + if (!had_one) { + return false; + } + } + } + } + } else { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + } + if (lexer->lookahead != '>') { + return false; + } + advance(s, lexer); + } + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '\r' || lexer->lookahead == '\n') { + lexer->result_symbol = HTML_BLOCK_7_START; + if (!s->simulate) + push_block(s, ANONYMOUS); + return true; + } + return false; +} + +static bool parse_pipe_table(Scanner *s, TSLexer *lexer, + const bool *valid_symbols) { + + // unused + (void)(valid_symbols); + + // PIPE_TABLE_START is zero width + mark_end(s, lexer); + // count number of cells + size_t cell_count = 0; + // also remember if we see starting and ending pipes, as empty headers have + // to have both + bool starting_pipe = false; + bool ending_pipe = false; + bool empty = true; + if (lexer->lookahead == '|') { + starting_pipe = true; + advance(s, lexer); + } + while (lexer->lookahead != '\r' && lexer->lookahead != '\n' && + !lexer->eof(lexer)) { + if (lexer->lookahead == '|') { + cell_count++; + ending_pipe = true; + advance(s, lexer); + } else { + if (lexer->lookahead != ' ' && lexer->lookahead != '\t') { + ending_pipe = false; + } + if (lexer->lookahead == '\\') { + advance(s, lexer); + if (is_punctuation((char)lexer->lookahead)) { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + } + } + if (empty && cell_count == 0 && !(starting_pipe && ending_pipe)) { + return false; + } + if (!ending_pipe) { + cell_count++; + } + + // check the following line for a delimiter row + // parse a newline + if (lexer->lookahead == '\n') { + advance(s, lexer); + } else if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + return false; + } + s->indentation = 0; + s->column = 0; + for (;;) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } else { + break; + } + } + 
s->simulate = true; + uint8_t matched_temp = 0; + while (matched_temp < (uint8_t)s->open_blocks.size) { + if (match(s, lexer, s->open_blocks.items[matched_temp])) { + matched_temp++; + } else { + return false; + } + } + + // check if delimiter row has the same number of cells and at least one pipe + size_t delimiter_cell_count = 0; + if (lexer->lookahead == '|') { + advance(s, lexer); + } + for (;;) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '|') { + delimiter_cell_count++; + advance(s, lexer); + continue; + } + if (lexer->lookahead == ':') { + advance(s, lexer); + if (lexer->lookahead != '-') { + return false; + } + } + bool had_one_minus = false; + while (lexer->lookahead == '-') { + had_one_minus = true; + advance(s, lexer); + } + if (had_one_minus) { + delimiter_cell_count++; + } + if (lexer->lookahead == ':') { + if (!had_one_minus) { + return false; + } + advance(s, lexer); + } + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + advance(s, lexer); + } + if (lexer->lookahead == '|') { + if (!had_one_minus) { + delimiter_cell_count++; + } + advance(s, lexer); + continue; + } + if (lexer->lookahead != '\r' && lexer->lookahead != '\n') { + return false; + } else { + break; + } + } + // if the cell counts are not equal then this is not a table + if (cell_count != delimiter_cell_count) { + return false; + } + + lexer->result_symbol = PIPE_TABLE_START; + return true; +} + +static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { + // A normal tree-sitter rule decided that the current branch is invalid and + // now "requests" an error to stop the branch + if (valid_symbols[TRIGGER_ERROR]) { + return error(lexer); + } + + // Close the inner most block after the next line break as requested. See + // `$._close_block` in grammar.js + if (valid_symbols[CLOSE_BLOCK]) { + s->state |= STATE_CLOSE_BLOCK; + lexer->result_symbol = CLOSE_BLOCK; + return true; + } + + // if we are at the end of the file and there are still open blocks close + // them all + if (lexer->eof(lexer)) { + if (valid_symbols[TOKEN_EOF]) { + lexer->result_symbol = TOKEN_EOF; + return true; + } + if (s->open_blocks.size > 0) { + lexer->result_symbol = BLOCK_CLOSE; + if (!s->simulate) + pop_block(s); + return true; + } + return false; + } + + if (!(s->state & STATE_MATCHING)) { + // Parse any preceeding whitespace and remember its length. This makes a + // lot of parsing quite a bit easier. + for (;;) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } else { + break; + } + } + // We are not matching. This is where the parsing logic for most + // "normal" token is. Most importantly parsing logic for the start of + // new blocks. + if (valid_symbols[INDENTED_CHUNK_START] && + !valid_symbols[NO_INDENTED_CHUNK]) { + if (s->indentation >= 4 && lexer->lookahead != '\n' && + lexer->lookahead != '\r') { + lexer->result_symbol = INDENTED_CHUNK_START; + if (!s->simulate) + push_block(s, INDENTED_CODE_BLOCK); + s->indentation -= 4; + return true; + } + } + // Decide which tokens to consider based on the first non-whitespace + // character + switch (lexer->lookahead) { + case '\r': + case '\n': + if (valid_symbols[BLANK_LINE_START]) { + // A blank line token is actually just 0 width, so do not + // consume the characters + lexer->result_symbol = BLANK_LINE_START; + return true; + } + break; + case '`': + // A backtick could mark the beginning or ending of a fenced + // code block. 
+ return parse_fenced_code_block(s, '`', lexer, valid_symbols); + case '~': + // A tilde could mark the beginning or ending of a fenced code + // block. + return parse_fenced_code_block(s, '~', lexer, valid_symbols); + case '*': + // A star could either mark a list item or a thematic break. + // This code is similar to the code for '_' and '+'. + return parse_star(s, lexer, valid_symbols); + case '_': + return parse_thematic_break_underscore(s, lexer, valid_symbols); + case '>': + // A '>' could mark the beginning of a block quote + return parse_block_quote(s, lexer, valid_symbols); + case '#': + // A '#' could mark a atx heading + return parse_atx_heading(s, lexer, valid_symbols); + case '=': + // A '=' could mark a setext underline + return parse_setext_underline(s, lexer, valid_symbols); + case '+': + // A '+' could be a list marker + return parse_plus(s, lexer, valid_symbols); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + // A number could be a list marker (if followed by a dot or a + // parenthesis) + return parse_ordered_list_marker(s, lexer, valid_symbols); + case '-': + // A minus could mark a list marker, a thematic break or a + // setext underline + return parse_minus(s, lexer, valid_symbols); + case '<': + // A < could mark the beginning of a html block + return parse_html_block(s, lexer, valid_symbols); + } + if (lexer->lookahead != '\r' && lexer->lookahead != '\n' && + valid_symbols[PIPE_TABLE_START]) { + return parse_pipe_table(s, lexer, valid_symbols); + } + } else { // we are in the state of trying to match all currently open blocks + bool partial_success = false; + while (s->matched < (uint8_t)s->open_blocks.size) { + if (s->matched == (uint8_t)s->open_blocks.size - 1 && + (s->state & STATE_CLOSE_BLOCK)) { + if (!partial_success) + s->state &= ~STATE_CLOSE_BLOCK; + break; + } + if (match(s, lexer, s->open_blocks.items[s->matched])) { + partial_success = true; + s->matched++; + } else { + if (s->state & STATE_WAS_SOFT_LINE_BREAK) { + s->state &= (~STATE_MATCHING); + } + break; + } + } + if (partial_success) { + if (s->matched == s->open_blocks.size) { + s->state &= (~STATE_MATCHING); + } + lexer->result_symbol = BLOCK_CONTINUATION; + return true; + } + + if (!(s->state & STATE_WAS_SOFT_LINE_BREAK)) { + lexer->result_symbol = BLOCK_CLOSE; + pop_block(s); + if (s->matched == s->open_blocks.size) { + s->state &= (~STATE_MATCHING); + } + return true; + } + } + + // The parser just encountered a line break. 
Setup the state correspondingly + if ((valid_symbols[LINE_ENDING] || valid_symbols[SOFT_LINE_ENDING] || + valid_symbols[PIPE_TABLE_LINE_ENDING]) && + (lexer->lookahead == '\n' || lexer->lookahead == '\r')) { + if (lexer->lookahead == '\r') { + advance(s, lexer); + if (lexer->lookahead == '\n') { + advance(s, lexer); + } + } else { + advance(s, lexer); + } + s->indentation = 0; + s->column = 0; + if (!(s->state & STATE_CLOSE_BLOCK) && + (valid_symbols[SOFT_LINE_ENDING] || + valid_symbols[PIPE_TABLE_LINE_ENDING])) { + lexer->mark_end(lexer); + for (;;) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\t') { + s->indentation += advance(s, lexer); + } else { + break; + } + } + s->simulate = true; + uint8_t matched_temp = s->matched; + s->matched = 0; + bool one_will_be_matched = false; + while (s->matched < (uint8_t)s->open_blocks.size) { + if (match(s, lexer, s->open_blocks.items[s->matched])) { + s->matched++; + one_will_be_matched = true; + } else { + break; + } + } + bool all_will_be_matched = s->matched == s->open_blocks.size; + if (!lexer->eof(lexer) && + !scan(s, lexer, paragraph_interrupt_symbols)) { + s->matched = matched_temp; + // If the last line break ended a paragraph and no new block + // opened, the last line break should have been a soft line + // break Reset the counter for matched blocks + s->matched = 0; + s->indentation = 0; + s->column = 0; + // If there is at least one open block, we should be in the + // matching state. Also set the matching flag if a + // `$._soft_line_break_marker` can be emitted so it does get + // emitted. + if (one_will_be_matched) { + s->state |= STATE_MATCHING; + } else { + s->state &= (~STATE_MATCHING); + } + if (valid_symbols[PIPE_TABLE_LINE_ENDING]) { + if (all_will_be_matched) { + lexer->result_symbol = PIPE_TABLE_LINE_ENDING; + return true; + } + } else { + lexer->result_symbol = SOFT_LINE_ENDING; + // reset some state variables + s->state |= STATE_WAS_SOFT_LINE_BREAK; + return true; + } + } else { + s->matched = matched_temp; + } + s->indentation = 0; + s->column = 0; + } + if (valid_symbols[LINE_ENDING]) { + // If the last line break ended a paragraph and no new block opened, + // the last line break should have been a soft line break Reset the + // counter for matched blocks + s->matched = 0; + // If there is at least one open block, we should be in the matching + // state. Also set the matching flag if a + // `$._soft_line_break_marker` can be emitted so it does get + // emitted. 
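+            // Unlike the soft line ending path above, nothing is simulated here:
+            // the still-open blocks are matched on the next call to scan().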
+ if (s->open_blocks.size > 0) { + s->state |= STATE_MATCHING; + } else { + s->state &= (~STATE_MATCHING); + } + // reset some state variables + s->state &= (~STATE_WAS_SOFT_LINE_BREAK); + lexer->result_symbol = LINE_ENDING; + return true; + } + } + return false; +} + +void *tree_sitter_markdown_external_scanner_create(void) { + Scanner *s = (Scanner *)malloc(sizeof(Scanner)); + s->open_blocks.items = (Block *)calloc(1, sizeof(Block)); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) + _Static_assert(ATX_H6_MARKER == ATX_H1_MARKER + 5, ""); +#else + assert(ATX_H6_MARKER == ATX_H1_MARKER + 5); +#endif + deserialize(s, NULL, 0); + + return s; +} + +bool tree_sitter_markdown_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + scanner->simulate = false; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_markdown_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_markdown_external_scanner_deserialize(void *payload, + char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +void tree_sitter_markdown_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + free(scanner->open_blocks.items); + free(scanner); +} diff --git a/test-grammars/markdown/src/tree_sitter/alloc.h b/test-grammars/markdown/src/tree_sitter/alloc.h new file mode 100644 index 0000000..1f4466d --- /dev/null +++ b/test-grammars/markdown/src/tree_sitter/alloc.h @@ -0,0 +1,54 @@ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +// Allow clients to override allocation functions +#ifdef TREE_SITTER_REUSE_ALLOCATOR + +extern void *(*ts_current_malloc)(size_t); +extern void *(*ts_current_calloc)(size_t, size_t); +extern void *(*ts_current_realloc)(void *, size_t); +extern void (*ts_current_free)(void *); + +#ifndef ts_malloc +#define ts_malloc ts_current_malloc +#endif +#ifndef ts_calloc +#define ts_calloc ts_current_calloc +#endif +#ifndef ts_realloc +#define ts_realloc ts_current_realloc +#endif +#ifndef ts_free +#define ts_free ts_current_free +#endif + +#else + +#ifndef ts_malloc +#define ts_malloc malloc +#endif +#ifndef ts_calloc +#define ts_calloc calloc +#endif +#ifndef ts_realloc +#define ts_realloc realloc +#endif +#ifndef ts_free +#define ts_free free +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ALLOC_H_ diff --git a/test-grammars/markdown/src/tree_sitter/array.h b/test-grammars/markdown/src/tree_sitter/array.h new file mode 100644 index 0000000..186ba67 --- /dev/null +++ b/test-grammars/markdown/src/tree_sitter/array.h @@ -0,0 +1,287 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./alloc.h" + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(disable : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#define Array(T) \ + struct { \ + T *contents; \ + uint32_t size; \ + uint32_t capacity; \ + } + +/// Initialize an array. +#define array_init(self) \ + ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +/// Create an empty array. 
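+/// Note that this expands to a brace initializer, so it can only be used to
+/// initialize a declaration, e.g. `Array(uint32_t) ids = array_new();`.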
+#define array_new() \ + { NULL, 0, 0 } + +/// Get a pointer to the element at a given `index` in the array. +#define array_get(self, _index) \ + (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + +/// Get a pointer to the first element in the array. +#define array_front(self) array_get(self, 0) + +/// Get a pointer to the last element in the array. +#define array_back(self) array_get(self, (self)->size - 1) + +/// Clear the array, setting its size to zero. Note that this does not free any +/// memory allocated for the array's contents. +#define array_clear(self) ((self)->size = 0) + +/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is +/// less than the array's current capacity, this function has no effect. +#define array_reserve(self, new_capacity) \ + _array__reserve((Array *)(self), array_elem_size(self), new_capacity) + +/// Free any memory allocated for this array. Note that this does not free any +/// memory allocated for the array's contents. +#define array_delete(self) _array__delete((Array *)(self)) + +/// Push a new `element` onto the end of the array. +#define array_push(self, element) \ + (_array__grow((Array *)(self), 1, array_elem_size(self)), \ + (self)->contents[(self)->size++] = (element)) + +/// Increase the array's size by `count` elements. +/// New elements are zero-initialized. +#define array_grow_by(self, count) \ + (_array__grow((Array *)(self), count, array_elem_size(self)), \ + memset((self)->contents + (self)->size, 0, (count) * array_elem_size(self)), \ + (self)->size += (count)) + +/// Append all elements from one array to the end of another. +#define array_push_all(self, other) \ + array_extend((self), (other)->size, (other)->contents) + +/// Append `count` elements to the end of the array, reading their values from the +/// `contents` pointer. +#define array_extend(self, count, contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), (self)->size, \ + 0, count, contents \ + ) + +/// Remove `old_count` elements from the array starting at the given `index`. At +/// the same index, insert `new_count` new elements, reading their values from the +/// `new_contents` pointer. +#define array_splice(self, _index, old_count, new_count, new_contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), _index, \ + old_count, new_count, new_contents \ + ) + +/// Insert one `element` into the array at the given `index`. +#define array_insert(self, _index, element) \ + _array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element)) + +/// Remove one element from the array at the given `index`. +#define array_erase(self, _index) \ + _array__erase((Array *)(self), array_elem_size(self), _index) + +/// Pop the last element off the array, returning the element by value. +#define array_pop(self) ((self)->contents[--(self)->size]) + +/// Assign the contents of one array to another, reallocating if necessary. +#define array_assign(self, other) \ + _array__assign((Array *)(self), (const Array *)(other), array_elem_size(self)) + +/// Swap one array with another +#define array_swap(self, other) \ + _array__swap((Array *)(self), (Array *)(other)) + +/// Get the size of the array contents +#define array_elem_size(self) (sizeof *(self)->contents) + +/// Search a sorted array for a given `needle` value, using the given `compare` +/// callback to determine the order. 
+/// +/// If an existing element is found to be equal to `needle`, then the `index` +/// out-parameter is set to the existing value's index, and the `exists` +/// out-parameter is set to true. Otherwise, `index` is set to an index where +/// `needle` should be inserted in order to preserve the sorting, and `exists` +/// is set to false. +#define array_search_sorted_with(self, compare, needle, _index, _exists) \ + _array__search_sorted(self, 0, compare, , needle, _index, _exists) + +/// Search a sorted array for a given `needle` value, using integer comparisons +/// of a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_with`. +#define array_search_sorted_by(self, field, needle, _index, _exists) \ + _array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists) + +/// Insert a given `value` into a sorted array, using the given `compare` +/// callback to determine the order. +#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_with(self, compare, &(value), &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +/// Insert a given `value` into a sorted array, using integer comparisons of +/// a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_by(self, field, (value) field, &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +// Private + +typedef Array(void) Array; + +/// This is not what you're looking for, see `array_delete`. +static inline void _array__delete(Array *self) { + if (self->contents) { + ts_free(self->contents); + self->contents = NULL; + self->size = 0; + self->capacity = 0; + } +} + +/// This is not what you're looking for, see `array_erase`. +static inline void _array__erase(Array *self, size_t element_size, + uint32_t index) { + assert(index < self->size); + char *contents = (char *)self->contents; + memmove(contents + index * element_size, contents + (index + 1) * element_size, + (self->size - index - 1) * element_size); + self->size--; +} + +/// This is not what you're looking for, see `array_reserve`. +static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) { + if (new_capacity > self->capacity) { + if (self->contents) { + self->contents = ts_realloc(self->contents, new_capacity * element_size); + } else { + self->contents = ts_malloc(new_capacity * element_size); + } + self->capacity = new_capacity; + } +} + +/// This is not what you're looking for, see `array_assign`. +static inline void _array__assign(Array *self, const Array *other, size_t element_size) { + _array__reserve(self, element_size, other->size); + self->size = other->size; + memcpy(self->contents, other->contents, self->size * element_size); +} + +/// This is not what you're looking for, see `array_swap`. +static inline void _array__swap(Array *self, Array *other) { + Array swap = *other; + *other = *self; + *self = swap; +} + +/// This is not what you're looking for, see `array_push` or `array_grow_by`. 
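+/// Capacity grows geometrically: at least 8, then doubling (or jumping straight
+/// to the requested size if that is larger), so n pushes cause O(log n) reallocations.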
+static inline void _array__grow(Array *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; + if (new_size > self->capacity) { + uint32_t new_capacity = self->capacity * 2; + if (new_capacity < 8) new_capacity = 8; + if (new_capacity < new_size) new_capacity = new_size; + _array__reserve(self, element_size, new_capacity); + } +} + +/// This is not what you're looking for, see `array_splice`. +static inline void _array__splice(Array *self, size_t element_size, + uint32_t index, uint32_t old_count, + uint32_t new_count, const void *elements) { + uint32_t new_size = self->size + new_count - old_count; + uint32_t old_end = index + old_count; + uint32_t new_end = index + new_count; + assert(old_end <= self->size); + + _array__reserve(self, element_size, new_size); + + char *contents = (char *)self->contents; + if (self->size > old_end) { + memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } + self->size += new_count - old_count; +} + +/// A binary search routine, based on Rust's `std::slice::binary_search_by`. +/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`. +#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \ + do { \ + *(_index) = start; \ + *(_exists) = false; \ + uint32_t size = (self)->size - *(_index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(_index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(_index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \ + if (comparison == 0) *(_exists) = true; \ + else if (comparison < 0) *(_index) += 1; \ + } while (0) + +/// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +/// parameter by reference in order to work with the generic sorting function above. 
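+/// The subtraction yields an ordinary comparator result: negative, zero, or
+/// positive as the existing element compares below, equal to, or above `needle`.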
+#define _compare_int(a, b) ((int)*(a) - (int)(b)) + +#ifdef _MSC_VER +#pragma warning(default : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ARRAY_H_ diff --git a/test-grammars/markdown/src/tree_sitter/parser.h b/test-grammars/markdown/src/tree_sitter/parser.h new file mode 100644 index 0000000..17b4fde --- /dev/null +++ b/test-grammars/markdown/src/tree_sitter/parser.h @@ -0,0 +1,230 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSStateId; +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; + const TSStateId *primary_state_ids; +}; + +/* + * Lexer Macros + */ + +#ifdef _MSC_VER +#define UNUSED __pragma(warning(suppress : 4101)) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define START_LEXER() \ + 
bool result = false; \ + bool skip = false; \ + UNUSED \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT) + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value) \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value), \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_val, child_count_val, ...) \ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_val, \ + .child_count = child_count_val, \ + __VA_ARGS__ \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_ diff --git a/test-grammars/rust/LICENSE b/test-grammars/rust/LICENSE new file mode 100644 index 0000000..ceaf3c9 --- /dev/null +++ b/test-grammars/rust/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2017 Maxim Sokolov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/test-grammars/rust/highlights.scm b/test-grammars/rust/highlights.scm new file mode 100644 index 0000000..52c13dd --- /dev/null +++ b/test-grammars/rust/highlights.scm @@ -0,0 +1,479 @@ +; ------- +; Basic identifiers +; ------- + +; We do not style ? as an operator on purpose as it allows styling ? differently, as many highlighters do. @operator.special might have been a better scope, but @special is already documented so the change would break themes (including the intent of the default theme) +"?" @special + +(type_identifier) @type +(identifier) @variable +(field_identifier) @variable.other.member + +; ------- +; Operators +; ------- + +[ + "*" + "'" + "->" + "=>" + "<=" + "=" + "==" + "!" 
+ "!=" + "%" + "%=" + "&" + "&=" + "&&" + "|" + "|=" + "||" + "^" + "^=" + "*" + "*=" + "-" + "-=" + "+" + "+=" + "/" + "/=" + ">" + "<" + ">=" + ">>" + "<<" + ">>=" + "<<=" + "@" + ".." + "..=" + "'" +] @operator + +; ------- +; Paths +; ------- + +(use_declaration + argument: (identifier) @namespace) +(use_wildcard + (identifier) @namespace) +(extern_crate_declaration + name: (identifier) @namespace + alias: (identifier)? @namespace) +(mod_item + name: (identifier) @namespace) +(scoped_use_list + path: (identifier)? @namespace) +(use_list + (identifier) @namespace) +(use_as_clause + path: (identifier)? @namespace + alias: (identifier) @namespace) + +; ------- +; Types +; ------- + +(type_parameters + (type_identifier) @type.parameter) +(constrained_type_parameter + left: (type_identifier) @type.parameter) +(optional_type_parameter + name: (type_identifier) @type.parameter) +((type_arguments (type_identifier) @constant) + (#match? @constant "^[A-Z_]+$")) +(type_arguments (type_identifier) @type) +(tuple_struct_pattern "_" @comment.unused) +((type_arguments (type_identifier) @comment.unused) + (#eq? @comment.unused "_")) + +; --- +; Primitives +; --- + +(escape_sequence) @constant.character.escape +(primitive_type) @type.builtin +(boolean_literal) @constant.builtin.boolean +(integer_literal) @constant.numeric.integer +(float_literal) @constant.numeric.float +(char_literal) @constant.character +[ + (string_literal) + (raw_string_literal) +] @string +(outer_doc_comment_marker "/" @comment) +(inner_doc_comment_marker "!" @comment) +[ + (line_comment) + (block_comment) +] @comment + +; --- +; Extraneous +; --- + +(self) @variable.builtin + +(field_initializer + (field_identifier) @variable.other.member) +(shorthand_field_initializer + (identifier) @variable.other.member) +(shorthand_field_identifier) @variable.other.member + +(lifetime + "'" @label + (identifier) @label) +(label + "'" @label + (identifier) @label) + +; --- +; Punctuation +; --- + +[ + "::" + "." + ";" + "," + ":" +] @punctuation.delimiter + +[ + "(" + ")" + "[" + "]" + "{" + "}" + "#" +] @punctuation.bracket +(type_arguments + [ + "<" + ">" + ] @punctuation.bracket) +(type_parameters + [ + "<" + ">" + ] @punctuation.bracket) +(for_lifetimes ["<" ">"] @punctuation.bracket) +(closure_parameters + "|" @punctuation.bracket) +(bracketed_type ["<" ">"] @punctuation.bracket) + +; --- +; Variables +; --- + +(let_declaration + pattern: [ + ((identifier) @variable) + ((tuple_pattern + (identifier) @variable)) + ]) + +; It needs to be anonymous to not conflict with `call_expression` further below. +(_ + value: (field_expression + value: (identifier)? @variable + field: (field_identifier) @variable.other.member)) + +(parameter + pattern: (identifier) @variable.parameter) +(closure_parameters + (identifier) @variable.parameter) + +; ------- +; Keywords +; ------- + +(for_expression + "for" @keyword.control.repeat) +(gen_block "gen" @keyword.control) + +"in" @keyword.control + +[ + "match" + "if" + "else" + "try" +] @keyword.control.conditional + +[ + "while" + "loop" +] @keyword.control.repeat + +[ + "break" + "continue" + "return" + "await" + "yield" +] @keyword.control.return + +"use" @keyword.control.import +(mod_item "mod" @keyword.control.import !body) +(use_as_clause "as" @keyword.control.import) + +(type_cast_expression "as" @keyword.operator) + +((generic_type + type: (type_identifier) @keyword) + (#eq? 
@keyword "use")) + +[ + (crate) + (super) + "as" + "pub" + "mod" + "extern" + + "impl" + "where" + "trait" + "for" + + "default" + "async" +] @keyword + +[ + "struct" + "enum" + "union" + "type" +] @keyword.storage.type + +"let" @keyword.storage +"fn" @keyword.function +"unsafe" @keyword.special +"macro_rules!" @function.macro + +(mutable_specifier) @keyword.storage.modifier.mut + +(reference_type "&" @keyword.storage.modifier.ref) +(self_parameter "&" @keyword.storage.modifier.ref) + +[ + "static" + "const" + "raw" + "ref" + "move" + "dyn" +] @keyword.storage.modifier + +; TODO: variable.mut to highlight mutable identifiers via locals.scm + +; --- +; Remaining Paths +; --- + +(scoped_identifier + path: (identifier)? @namespace + name: (identifier) @namespace) +(scoped_type_identifier + path: (identifier) @namespace) + +; ------- +; Functions +; ------- + +(call_expression + function: [ + ((identifier) @function) + (scoped_identifier + name: (identifier) @function) + (field_expression + field: (field_identifier) @function) + ]) +(generic_function + function: [ + ((identifier) @function) + (scoped_identifier + name: (identifier) @function) + (field_expression + field: (field_identifier) @function.method) + ]) + +(function_item + name: (identifier) @function) + +(function_signature_item + name: (identifier) @function) + +; ------- +; Guess Other Types +; ------- +; Other PascalCase identifiers are assumed to be structs. + +((identifier) @type + (#match? @type "^[A-Z]")) + +(never_type "!" @type) + +((identifier) @constant + (#match? @constant "^[A-Z][A-Z\\d_]*$")) + +; --- +; PascalCase identifiers in call_expressions (e.g. `Ok()`) +; are assumed to be enum constructors. +; --- + +(call_expression + function: [ + ((identifier) @constructor + (#match? @constructor "^[A-Z]")) + (scoped_identifier + name: ((identifier) @constructor + (#match? @constructor "^[A-Z]"))) + ]) + +; --- +; PascalCase identifiers under a path which is also PascalCase +; are assumed to be constructors if they have methods or fields. +; --- + +(field_expression + value: (scoped_identifier + path: [ + (identifier) @type + (scoped_identifier + name: (identifier) @type) + ] + name: (identifier) @constructor + (#match? @type "^[A-Z]") + (#match? @constructor "^[A-Z]"))) + +(enum_variant (identifier) @type.enum.variant) + + +; ------- +; Constructors +; ------- +; TODO: this is largely guesswork, remove it once we get actual info from locals.scm or r-a + +(struct_expression + name: (type_identifier) @constructor) + +(tuple_struct_pattern + type: [ + (identifier) @constructor + (scoped_identifier + name: (identifier) @constructor) + ]) +(struct_pattern + type: [ + ((type_identifier) @constructor) + (scoped_type_identifier + name: (type_identifier) @constructor) + ]) +(match_pattern + ((identifier) @constructor) (#match? @constructor "^[A-Z]")) +(or_pattern + ((identifier) @constructor) + ((identifier) @constructor) + (#match? @constructor "^[A-Z]")) + +; --- +; Macros +; --- + +(attribute + (identifier) @function.macro) +(inner_attribute_item "!" @punctuation) +(attribute + [ + (identifier) @function.macro + (scoped_identifier + name: (identifier) @function.macro) + ] + (token_tree (identifier) @function.macro)?) + +(inner_attribute_item) @attribute + +(macro_definition + name: (identifier) @function.macro) +(macro_invocation + macro: [ + ((identifier) @function.macro) + (scoped_identifier + name: (identifier) @function.macro) + ] + "!" 
@function.macro) + +(metavariable) @variable.parameter +(fragment_specifier) @type + +(attribute + (identifier) @special + arguments: (token_tree (identifier) @type) + (#eq? @special "derive") +) + +; --- +; Prelude +; --- + +((identifier) @type.enum.variant.builtin + (#any-of? @type.enum.variant.builtin "Some" "None" "Ok" "Err")) + + +(call_expression + (identifier) @function.builtin + (#any-of? @function.builtin + "drop" + "size_of" + "size_of_val" + "align_of" + "align_of_val")) + +((type_identifier) @type.builtin + (#any-of? + @type.builtin + "Send" + "Sized" + "Sync" + "Unpin" + "Drop" + "Fn" + "FnMut" + "FnOnce" + "AsMut" + "AsRef" + "From" + "Into" + "DoubleEndedIterator" + "ExactSizeIterator" + "Extend" + "IntoIterator" + "Iterator" + "Option" + "Result" + "Clone" + "Copy" + "Debug" + "Default" + "Eq" + "Hash" + "Ord" + "PartialEq" + "PartialOrd" + "ToOwned" + "Box" + "String" + "ToString" + "Vec" + "FromIterator" + "TryFrom" + "TryInto")) diff --git a/test-grammars/rust/injections.scm b/test-grammars/rust/injections.scm new file mode 100644 index 0000000..06c4d13 --- /dev/null +++ b/test-grammars/rust/injections.scm @@ -0,0 +1,81 @@ +([(line_comment !doc) (block_comment !doc)] @injection.content + (#set! injection.language "comment")) + +((doc_comment) @injection.content + (#set! injection.language "markdown") + (#set! injection.combined)) + +((macro_invocation + macro: + [ + (scoped_identifier + name: (_) @_macro_name) + (identifier) @_macro_name + ] + (token_tree) @injection.content) + (#eq? @_macro_name "html") + (#set! injection.language "html") + (#set! injection.include-children)) + +((macro_invocation + macro: + [ + (scoped_identifier + name: (_) @_macro_name) + (identifier) @_macro_name + ] + (token_tree) @injection.content) + (#eq? @_macro_name "slint") + (#set! injection.language "slint") + (#set! injection.include-children)) + +((macro_invocation + (token_tree) @injection.content) + (#set! injection.language "rust") + (#set! injection.include-children)) + +((macro_rule + (token_tree) @injection.content) + (#set! injection.language "rust") + (#set! injection.include-children)) + +(call_expression + function: (scoped_identifier + path: (identifier) @_regex (#eq? @_regex "Regex") + name: (identifier) @_new (#eq? @_new "new")) + arguments: (arguments (raw_string_literal) @injection.content) + (#set! injection.language "regex")) + +(call_expression + function: (scoped_identifier + path: (scoped_identifier (identifier) @_regex (#eq? @_regex "Regex") .) + name: (identifier) @_new (#eq? @_new "new")) + arguments: (arguments (raw_string_literal) @injection.content) + (#set! injection.language "regex")) + +; Highlight SQL in `sqlx::query!()`, `sqlx::query_scalar!()`, and `sqlx::query_scalar_unchecked!()` +(macro_invocation + macro: (scoped_identifier + path: (identifier) @_sqlx (#eq? @_sqlx "sqlx") + name: (identifier) @_query (#match? @_query "^query(_scalar|_scalar_unchecked)?$")) + (token_tree + ; Only the first argument is SQL + . + [(string_literal) (raw_string_literal)] @injection.content + ) + (#set! injection.language "sql")) + +; Highlight SQL in `sqlx::query_as!()` and `sqlx::query_as_unchecked!()` +(macro_invocation + macro: (scoped_identifier + path: (identifier) @_sqlx (#eq? @_sqlx "sqlx") + name: (identifier) @_query_as (#match? @_query_as "^query_as(_unchecked)?$")) + (token_tree + ; Only the second argument is SQL + . 
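+    ; (the leading `.` anchors the wildcard below to the token_tree's first named child)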
+ ; Allow anything as the first argument in case the user has lower case type + ; names for some reason + (_) + [(string_literal) (raw_string_literal)] @injection.content + ) + (#set! injection.language "sql")) diff --git a/test-grammars/rust/metadata.json b/test-grammars/rust/metadata.json new file mode 100644 index 0000000..b4b896b --- /dev/null +++ b/test-grammars/rust/metadata.json @@ -0,0 +1,6 @@ +{ + "repo": "https://github.com/tree-sitter/tree-sitter-rust", + "rev": "1f63b33efee17e833e0ea29266dd3d713e27e321", + "license": "MIT", + "compressed": true +} \ No newline at end of file diff --git a/test-grammars/rust/src/grammar.json b/test-grammars/rust/src/grammar.json new file mode 100644 index 0000000..42726eb Binary files /dev/null and b/test-grammars/rust/src/grammar.json differ diff --git a/test-grammars/rust/src/parser.c b/test-grammars/rust/src/parser.c new file mode 100644 index 0000000..4acbf46 Binary files /dev/null and b/test-grammars/rust/src/parser.c differ diff --git a/test-grammars/rust/src/scanner.c b/test-grammars/rust/src/scanner.c new file mode 100644 index 0000000..269f6b2 --- /dev/null +++ b/test-grammars/rust/src/scanner.c @@ -0,0 +1,393 @@ +#include "tree_sitter/alloc.h" +#include "tree_sitter/parser.h" + +#include + +enum TokenType { + STRING_CONTENT, + RAW_STRING_LITERAL_START, + RAW_STRING_LITERAL_CONTENT, + RAW_STRING_LITERAL_END, + FLOAT_LITERAL, + BLOCK_OUTER_DOC_MARKER, + BLOCK_INNER_DOC_MARKER, + BLOCK_COMMENT_CONTENT, + LINE_DOC_CONTENT, + ERROR_SENTINEL +}; + +typedef struct { + uint8_t opening_hash_count; +} Scanner; + +void *tree_sitter_rust_external_scanner_create() { return ts_calloc(1, sizeof(Scanner)); } + +void tree_sitter_rust_external_scanner_destroy(void *payload) { ts_free((Scanner *)payload); } + +unsigned tree_sitter_rust_external_scanner_serialize(void *payload, char *buffer) { + Scanner *scanner = (Scanner *)payload; + buffer[0] = (char)scanner->opening_hash_count; + return 1; +} + +void tree_sitter_rust_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *scanner = (Scanner *)payload; + scanner->opening_hash_count = 0; + if (length == 1) { + Scanner *scanner = (Scanner *)payload; + scanner->opening_hash_count = buffer[0]; + } +} + +static inline bool is_num_char(int32_t c) { return c == '_' || iswdigit(c); } + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static inline bool process_string(TSLexer *lexer) { + bool has_content = false; + for (;;) { + if (lexer->lookahead == '\"' || lexer->lookahead == '\\') { + break; + } + if (lexer->eof(lexer)) { + return false; + } + has_content = true; + advance(lexer); + } + lexer->result_symbol = STRING_CONTENT; + lexer->mark_end(lexer); + return has_content; +} + +static inline bool scan_raw_string_start(Scanner *scanner, TSLexer *lexer) { + if (lexer->lookahead == 'b' || lexer->lookahead == 'c') { + advance(lexer); + } + if (lexer->lookahead != 'r') { + return false; + } + advance(lexer); + + uint8_t opening_hash_count = 0; + while (lexer->lookahead == '#') { + advance(lexer); + opening_hash_count++; + } + + if (lexer->lookahead != '"') { + return false; + } + advance(lexer); + scanner->opening_hash_count = opening_hash_count; + + lexer->result_symbol = RAW_STRING_LITERAL_START; + return true; +} + +static inline bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) { + for (;;) { + if (lexer->eof(lexer)) { + return false; + } + if 
(lexer->lookahead == '"') { + lexer->mark_end(lexer); + advance(lexer); + unsigned hash_count = 0; + while (lexer->lookahead == '#' && hash_count < scanner->opening_hash_count) { + advance(lexer); + hash_count++; + } + if (hash_count == scanner->opening_hash_count) { + lexer->result_symbol = RAW_STRING_LITERAL_CONTENT; + return true; + } + } else { + advance(lexer); + } + } +} + +static inline bool scan_raw_string_end(Scanner *scanner, TSLexer *lexer) { + advance(lexer); + for (unsigned i = 0; i < scanner->opening_hash_count; i++) { + advance(lexer); + } + lexer->result_symbol = RAW_STRING_LITERAL_END; + return true; +} + +static inline bool process_float_literal(TSLexer *lexer) { + lexer->result_symbol = FLOAT_LITERAL; + + advance(lexer); + while (is_num_char(lexer->lookahead)) { + advance(lexer); + } + + bool has_fraction = false, has_exponent = false; + + if (lexer->lookahead == '.') { + has_fraction = true; + advance(lexer); + if (iswalpha(lexer->lookahead)) { + // The dot is followed by a letter: 1.max(2) => not a float but an integer + return false; + } + + if (lexer->lookahead == '.') { + return false; + } + while (is_num_char(lexer->lookahead)) { + advance(lexer); + } + } + + lexer->mark_end(lexer); + + if (lexer->lookahead == 'e' || lexer->lookahead == 'E') { + has_exponent = true; + advance(lexer); + if (lexer->lookahead == '+' || lexer->lookahead == '-') { + advance(lexer); + } + if (!is_num_char(lexer->lookahead)) { + return true; + } + advance(lexer); + while (is_num_char(lexer->lookahead)) { + advance(lexer); + } + + lexer->mark_end(lexer); + } + + if (!has_exponent && !has_fraction) { + return false; + } + + if (lexer->lookahead != 'u' && lexer->lookahead != 'i' && lexer->lookahead != 'f') { + return true; + } + advance(lexer); + if (!iswdigit(lexer->lookahead)) { + return true; + } + + while (iswdigit(lexer->lookahead)) { + advance(lexer); + } + + lexer->mark_end(lexer); + return true; +} + +static inline bool process_line_doc_content(TSLexer *lexer) { + lexer->result_symbol = LINE_DOC_CONTENT; + for (;;) { + if (lexer->eof(lexer)) { + return true; + } + if (lexer->lookahead == '\n') { + // Include the newline in the doc content node. + // Line endings are useful for markdown injection. + advance(lexer); + return true; + } + advance(lexer); + } +} + +typedef enum { + LeftForwardSlash, + LeftAsterisk, + Continuing, +} BlockCommentState; + +typedef struct { + BlockCommentState state; + unsigned nestingDepth; +} BlockCommentProcessing; + +static inline void process_left_forward_slash(BlockCommentProcessing *processing, char current) { + if (current == '*') { + processing->nestingDepth += 1; + } + processing->state = Continuing; +}; + +static inline void process_left_asterisk(BlockCommentProcessing *processing, char current, TSLexer *lexer) { + if (current == '*') { + lexer->mark_end(lexer); + processing->state = LeftAsterisk; + return; + } + + if (current == '/') { + processing->nestingDepth -= 1; + } + + processing->state = Continuing; +} + +static inline void process_continuing(BlockCommentProcessing *processing, char current) { + switch (current) { + case '/': + processing->state = LeftForwardSlash; + break; + case '*': + processing->state = LeftAsterisk; + break; + } +} + +static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbols) { + char first = (char)lexer->lookahead; + // The first character is stored so we can safely advance inside + // these if blocks. However, because we only store one, we can only + // safely advance 1 time. 
Since there's a chance that an advance could + // happen in one state, we must advance in all states to ensure that + // the program ends up in a sane state prior to processing the block + // comment if need be. + if (valid_symbols[BLOCK_INNER_DOC_MARKER] && first == '!') { + lexer->result_symbol = BLOCK_INNER_DOC_MARKER; + advance(lexer); + return true; + } + if (valid_symbols[BLOCK_OUTER_DOC_MARKER] && first == '*') { + advance(lexer); + lexer->mark_end(lexer); + // If the next token is a / that means that it's an empty block comment. + if (lexer->lookahead == '/') { + return false; + } + // If the next token is a * that means that this isn't a BLOCK_OUTER_DOC_MARKER + // as BLOCK_OUTER_DOC_MARKER's only have 2 * not 3 or more. + if (lexer->lookahead != '*') { + lexer->result_symbol = BLOCK_OUTER_DOC_MARKER; + return true; + } + } else { + advance(lexer); + } + + if (valid_symbols[BLOCK_COMMENT_CONTENT]) { + BlockCommentProcessing processing = {Continuing, 1}; + // Manually set the current state based on the first character + switch (first) { + case '*': + processing.state = LeftAsterisk; + if (lexer->lookahead == '/') { + // This case can happen in an empty doc block comment + // like /*!*/. The comment has no contents, so bail. + return false; + } + break; + case '/': + processing.state = LeftForwardSlash; + break; + default: + processing.state = Continuing; + break; + } + + // For the purposes of actually parsing rust code, this + // is incorrect as it considers an unterminated block comment + // to be an error. However, for the purposes of syntax highlighting + // this should be considered successful as otherwise you are not able + // to syntax highlight a block of code prior to closing the + // block comment + while (!lexer->eof(lexer) && processing.nestingDepth != 0) { + // Set first to the current lookahead as that is the second character + // as we force an advance in the above code when we are checking if we + // need to handle a block comment inner or outer doc comment signifier + // node + first = (char)lexer->lookahead; + switch (processing.state) { + case LeftForwardSlash: + process_left_forward_slash(&processing, first); + break; + case LeftAsterisk: + process_left_asterisk(&processing, first, lexer); + break; + case Continuing: + lexer->mark_end(lexer); + process_continuing(&processing, first); + break; + default: + break; + } + advance(lexer); + if (first == '/' && processing.nestingDepth != 0) { + lexer->mark_end(lexer); + } + } + lexer->result_symbol = BLOCK_COMMENT_CONTENT; + return true; + } + + return false; +} + +bool tree_sitter_rust_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + // The documentation states that if the lexical analysis fails for some reason + // they will mark every state as valid and pass it to the external scanner + // However, we can't do anything to help them recover in that case so we + // should just fail. + /* + link: https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners + If a syntax error is encountered during regular parsing, Tree-sitter’s + first action during error recovery will be to call the external scanner’s + scan function with all tokens marked valid. The scanner should detect this + case and handle it appropriately. 
One simple method of detection is to add + an unused token to the end of the externals array, for example + + externals: $ => [$.token1, $.token2, $.error_sentinel], + + then check whether that token is marked valid to determine whether + Tree-sitter is in error correction mode. + */ + if (valid_symbols[ERROR_SENTINEL]) { + return false; + } + + Scanner *scanner = (Scanner *)payload; + + if (valid_symbols[BLOCK_COMMENT_CONTENT] || valid_symbols[BLOCK_INNER_DOC_MARKER] || + valid_symbols[BLOCK_OUTER_DOC_MARKER]) { + return process_block_comment(lexer, valid_symbols); + } + + if (valid_symbols[STRING_CONTENT] && !valid_symbols[FLOAT_LITERAL]) { + return process_string(lexer); + } + + if (valid_symbols[LINE_DOC_CONTENT]) { + return process_line_doc_content(lexer); + } + + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + + if (valid_symbols[RAW_STRING_LITERAL_START] && + (lexer->lookahead == 'r' || lexer->lookahead == 'b' || lexer->lookahead == 'c')) { + return scan_raw_string_start(scanner, lexer); + } + + if (valid_symbols[RAW_STRING_LITERAL_CONTENT]) { + return scan_raw_string_content(scanner, lexer); + } + + if (valid_symbols[RAW_STRING_LITERAL_END] && lexer->lookahead == '"') { + return scan_raw_string_end(scanner, lexer); + } + + if (valid_symbols[FLOAT_LITERAL] && iswdigit(lexer->lookahead)) { + return process_float_literal(lexer); + } + + return false; +} diff --git a/test-grammars/rust/src/tree_sitter/alloc.h b/test-grammars/rust/src/tree_sitter/alloc.h new file mode 100644 index 0000000..1abdd12 --- /dev/null +++ b/test-grammars/rust/src/tree_sitter/alloc.h @@ -0,0 +1,54 @@ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +// Allow clients to override allocation functions +#ifdef TREE_SITTER_REUSE_ALLOCATOR + +extern void *(*ts_current_malloc)(size_t size); +extern void *(*ts_current_calloc)(size_t count, size_t size); +extern void *(*ts_current_realloc)(void *ptr, size_t size); +extern void (*ts_current_free)(void *ptr); + +#ifndef ts_malloc +#define ts_malloc ts_current_malloc +#endif +#ifndef ts_calloc +#define ts_calloc ts_current_calloc +#endif +#ifndef ts_realloc +#define ts_realloc ts_current_realloc +#endif +#ifndef ts_free +#define ts_free ts_current_free +#endif + +#else + +#ifndef ts_malloc +#define ts_malloc malloc +#endif +#ifndef ts_calloc +#define ts_calloc calloc +#endif +#ifndef ts_realloc +#define ts_realloc realloc +#endif +#ifndef ts_free +#define ts_free free +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ALLOC_H_ diff --git a/test-grammars/rust/src/tree_sitter/array.h b/test-grammars/rust/src/tree_sitter/array.h new file mode 100644 index 0000000..15a3b23 --- /dev/null +++ b/test-grammars/rust/src/tree_sitter/array.h @@ -0,0 +1,290 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./alloc.h" + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(disable : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#define Array(T) \ + struct { \ + T *contents; \ + uint32_t size; \ + uint32_t capacity; \ + } + +/// Initialize an array. +#define array_init(self) \ + ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +/// Create an empty array. 
+#define array_new() \ + { NULL, 0, 0 } + +/// Get a pointer to the element at a given `index` in the array. +#define array_get(self, _index) \ + (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + +/// Get a pointer to the first element in the array. +#define array_front(self) array_get(self, 0) + +/// Get a pointer to the last element in the array. +#define array_back(self) array_get(self, (self)->size - 1) + +/// Clear the array, setting its size to zero. Note that this does not free any +/// memory allocated for the array's contents. +#define array_clear(self) ((self)->size = 0) + +/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is +/// less than the array's current capacity, this function has no effect. +#define array_reserve(self, new_capacity) \ + _array__reserve((Array *)(self), array_elem_size(self), new_capacity) + +/// Free any memory allocated for this array. Note that this does not free any +/// memory allocated for the array's contents. +#define array_delete(self) _array__delete((Array *)(self)) + +/// Push a new `element` onto the end of the array. +#define array_push(self, element) \ + (_array__grow((Array *)(self), 1, array_elem_size(self)), \ + (self)->contents[(self)->size++] = (element)) + +/// Increase the array's size by `count` elements. +/// New elements are zero-initialized. +#define array_grow_by(self, count) \ + do { \ + if ((count) == 0) break; \ + _array__grow((Array *)(self), count, array_elem_size(self)); \ + memset((self)->contents + (self)->size, 0, (count) * array_elem_size(self)); \ + (self)->size += (count); \ + } while (0) + +/// Append all elements from one array to the end of another. +#define array_push_all(self, other) \ + array_extend((self), (other)->size, (other)->contents) + +/// Append `count` elements to the end of the array, reading their values from the +/// `contents` pointer. +#define array_extend(self, count, contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), (self)->size, \ + 0, count, contents \ + ) + +/// Remove `old_count` elements from the array starting at the given `index`. At +/// the same index, insert `new_count` new elements, reading their values from the +/// `new_contents` pointer. +#define array_splice(self, _index, old_count, new_count, new_contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), _index, \ + old_count, new_count, new_contents \ + ) + +/// Insert one `element` into the array at the given `index`. +#define array_insert(self, _index, element) \ + _array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element)) + +/// Remove one element from the array at the given `index`. +#define array_erase(self, _index) \ + _array__erase((Array *)(self), array_elem_size(self), _index) + +/// Pop the last element off the array, returning the element by value. +#define array_pop(self) ((self)->contents[--(self)->size]) + +/// Assign the contents of one array to another, reallocating if necessary. +#define array_assign(self, other) \ + _array__assign((Array *)(self), (const Array *)(other), array_elem_size(self)) + +/// Swap one array with another +#define array_swap(self, other) \ + _array__swap((Array *)(self), (Array *)(other)) + +/// Get the size of the array contents +#define array_elem_size(self) (sizeof *(self)->contents) + +/// Search a sorted array for a given `needle` value, using the given `compare` +/// callback to determine the order. 
+/// +/// If an existing element is found to be equal to `needle`, then the `index` +/// out-parameter is set to the existing value's index, and the `exists` +/// out-parameter is set to true. Otherwise, `index` is set to an index where +/// `needle` should be inserted in order to preserve the sorting, and `exists` +/// is set to false. +#define array_search_sorted_with(self, compare, needle, _index, _exists) \ + _array__search_sorted(self, 0, compare, , needle, _index, _exists) + +/// Search a sorted array for a given `needle` value, using integer comparisons +/// of a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_with`. +#define array_search_sorted_by(self, field, needle, _index, _exists) \ + _array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists) + +/// Insert a given `value` into a sorted array, using the given `compare` +/// callback to determine the order. +#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_with(self, compare, &(value), &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +/// Insert a given `value` into a sorted array, using integer comparisons of +/// a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_by(self, field, (value) field, &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +// Private + +typedef Array(void) Array; + +/// This is not what you're looking for, see `array_delete`. +static inline void _array__delete(Array *self) { + if (self->contents) { + ts_free(self->contents); + self->contents = NULL; + self->size = 0; + self->capacity = 0; + } +} + +/// This is not what you're looking for, see `array_erase`. +static inline void _array__erase(Array *self, size_t element_size, + uint32_t index) { + assert(index < self->size); + char *contents = (char *)self->contents; + memmove(contents + index * element_size, contents + (index + 1) * element_size, + (self->size - index - 1) * element_size); + self->size--; +} + +/// This is not what you're looking for, see `array_reserve`. +static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) { + if (new_capacity > self->capacity) { + if (self->contents) { + self->contents = ts_realloc(self->contents, new_capacity * element_size); + } else { + self->contents = ts_malloc(new_capacity * element_size); + } + self->capacity = new_capacity; + } +} + +/// This is not what you're looking for, see `array_assign`. +static inline void _array__assign(Array *self, const Array *other, size_t element_size) { + _array__reserve(self, element_size, other->size); + self->size = other->size; + memcpy(self->contents, other->contents, self->size * element_size); +} + +/// This is not what you're looking for, see `array_swap`. +static inline void _array__swap(Array *self, Array *other) { + Array swap = *other; + *other = *self; + *self = swap; +} + +/// This is not what you're looking for, see `array_push` or `array_grow_by`. 
+static inline void _array__grow(Array *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; + if (new_size > self->capacity) { + uint32_t new_capacity = self->capacity * 2; + if (new_capacity < 8) new_capacity = 8; + if (new_capacity < new_size) new_capacity = new_size; + _array__reserve(self, element_size, new_capacity); + } +} + +/// This is not what you're looking for, see `array_splice`. +static inline void _array__splice(Array *self, size_t element_size, + uint32_t index, uint32_t old_count, + uint32_t new_count, const void *elements) { + uint32_t new_size = self->size + new_count - old_count; + uint32_t old_end = index + old_count; + uint32_t new_end = index + new_count; + assert(old_end <= self->size); + + _array__reserve(self, element_size, new_size); + + char *contents = (char *)self->contents; + if (self->size > old_end) { + memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } + self->size += new_count - old_count; +} + +/// A binary search routine, based on Rust's `std::slice::binary_search_by`. +/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`. +#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \ + do { \ + *(_index) = start; \ + *(_exists) = false; \ + uint32_t size = (self)->size - *(_index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(_index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(_index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \ + if (comparison == 0) *(_exists) = true; \ + else if (comparison < 0) *(_index) += 1; \ + } while (0) + +/// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +/// parameter by reference in order to work with the generic sorting function above. 
+#define _compare_int(a, b) ((int)*(a) - (int)(b)) + +#ifdef _MSC_VER +#pragma warning(default : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ARRAY_H_ diff --git a/test-grammars/rust/src/tree_sitter/parser.h b/test-grammars/rust/src/tree_sitter/parser.h new file mode 100644 index 0000000..799f599 --- /dev/null +++ b/test-grammars/rust/src/tree_sitter/parser.h @@ -0,0 +1,266 @@ +#ifndef TREE_SITTER_PARSER_H_ +#define TREE_SITTER_PARSER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#define ts_builtin_sym_error ((TSSymbol)-1) +#define ts_builtin_sym_end 0 +#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024 + +#ifndef TREE_SITTER_API_H_ +typedef uint16_t TSStateId; +typedef uint16_t TSSymbol; +typedef uint16_t TSFieldId; +typedef struct TSLanguage TSLanguage; +#endif + +typedef struct { + TSFieldId field_id; + uint8_t child_index; + bool inherited; +} TSFieldMapEntry; + +typedef struct { + uint16_t index; + uint16_t length; +} TSFieldMapSlice; + +typedef struct { + bool visible; + bool named; + bool supertype; +} TSSymbolMetadata; + +typedef struct TSLexer TSLexer; + +struct TSLexer { + int32_t lookahead; + TSSymbol result_symbol; + void (*advance)(TSLexer *, bool); + void (*mark_end)(TSLexer *); + uint32_t (*get_column)(TSLexer *); + bool (*is_at_included_range_start)(const TSLexer *); + bool (*eof)(const TSLexer *); + void (*log)(const TSLexer *, const char *, ...); +}; + +typedef enum { + TSParseActionTypeShift, + TSParseActionTypeReduce, + TSParseActionTypeAccept, + TSParseActionTypeRecover, +} TSParseActionType; + +typedef union { + struct { + uint8_t type; + TSStateId state; + bool extra; + bool repetition; + } shift; + struct { + uint8_t type; + uint8_t child_count; + TSSymbol symbol; + int16_t dynamic_precedence; + uint16_t production_id; + } reduce; + uint8_t type; +} TSParseAction; + +typedef struct { + uint16_t lex_state; + uint16_t external_lex_state; +} TSLexMode; + +typedef union { + TSParseAction action; + struct { + uint8_t count; + bool reusable; + } entry; +} TSParseActionEntry; + +typedef struct { + int32_t start; + int32_t end; +} TSCharacterRange; + +struct TSLanguage { + uint32_t version; + uint32_t symbol_count; + uint32_t alias_count; + uint32_t token_count; + uint32_t external_token_count; + uint32_t state_count; + uint32_t large_state_count; + uint32_t production_id_count; + uint32_t field_count; + uint16_t max_alias_sequence_length; + const uint16_t *parse_table; + const uint16_t *small_parse_table; + const uint32_t *small_parse_table_map; + const TSParseActionEntry *parse_actions; + const char * const *symbol_names; + const char * const *field_names; + const TSFieldMapSlice *field_map_slices; + const TSFieldMapEntry *field_map_entries; + const TSSymbolMetadata *symbol_metadata; + const TSSymbol *public_symbol_map; + const uint16_t *alias_map; + const TSSymbol *alias_sequences; + const TSLexMode *lex_modes; + bool (*lex_fn)(TSLexer *, TSStateId); + bool (*keyword_lex_fn)(TSLexer *, TSStateId); + TSSymbol keyword_capture_token; + struct { + const bool *states; + const TSSymbol *symbol_map; + void *(*create)(void); + void (*destroy)(void *); + bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist); + unsigned (*serialize)(void *, char *); + void (*deserialize)(void *, const char *, unsigned); + } external_scanner; + const TSStateId *primary_state_ids; +}; + +static inline bool set_contains(TSCharacterRange *ranges, uint32_t 
len, int32_t lookahead) { + uint32_t index = 0; + uint32_t size = len - index; + while (size > 1) { + uint32_t half_size = size / 2; + uint32_t mid_index = index + half_size; + TSCharacterRange *range = &ranges[mid_index]; + if (lookahead >= range->start && lookahead <= range->end) { + return true; + } else if (lookahead > range->end) { + index = mid_index; + } + size -= half_size; + } + TSCharacterRange *range = &ranges[index]; + return (lookahead >= range->start && lookahead <= range->end); +} + +/* + * Lexer Macros + */ + +#ifdef _MSC_VER +#define UNUSED __pragma(warning(suppress : 4101)) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define START_LEXER() \ + bool result = false; \ + bool skip = false; \ + UNUSED \ + bool eof = false; \ + int32_t lookahead; \ + goto start; \ + next_state: \ + lexer->advance(lexer, skip); \ + start: \ + skip = false; \ + lookahead = lexer->lookahead; + +#define ADVANCE(state_value) \ + { \ + state = state_value; \ + goto next_state; \ + } + +#define ADVANCE_MAP(...) \ + { \ + static const uint16_t map[] = { __VA_ARGS__ }; \ + for (uint32_t i = 0; i < sizeof(map) / sizeof(map[0]); i += 2) { \ + if (map[i] == lookahead) { \ + state = map[i + 1]; \ + goto next_state; \ + } \ + } \ + } + +#define SKIP(state_value) \ + { \ + skip = true; \ + state = state_value; \ + goto next_state; \ + } + +#define ACCEPT_TOKEN(symbol_value) \ + result = true; \ + lexer->result_symbol = symbol_value; \ + lexer->mark_end(lexer); + +#define END_STATE() return result; + +/* + * Parse Table Macros + */ + +#define SMALL_STATE(id) ((id) - LARGE_STATE_COUNT) + +#define STATE(id) id + +#define ACTIONS(id) id + +#define SHIFT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value) \ + } \ + }} + +#define SHIFT_REPEAT(state_value) \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .state = (state_value), \ + .repetition = true \ + } \ + }} + +#define SHIFT_EXTRA() \ + {{ \ + .shift = { \ + .type = TSParseActionTypeShift, \ + .extra = true \ + } \ + }} + +#define REDUCE(symbol_name, children, precedence, prod_id) \ + {{ \ + .reduce = { \ + .type = TSParseActionTypeReduce, \ + .symbol = symbol_name, \ + .child_count = children, \ + .dynamic_precedence = precedence, \ + .production_id = prod_id \ + }, \ + }} + +#define RECOVER() \ + {{ \ + .type = TSParseActionTypeRecover \ + }} + +#define ACCEPT_INPUT() \ + {{ \ + .type = TSParseActionTypeAccept \ + }} + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_PARSER_H_