Skip to content

Commit

Permalink
Merge branch 'tree-sitter-lua'
Browse files Browse the repository at this point in the history
Add initial support for Lua (5.3 specifically).

Introduce TSTransformer class that can be used by any other tree-sitter
based parsers.

Highlight unidentified non-leaf S-nodes in red in `--dump-stree`.
  • Loading branch information
xaizek committed Mar 25, 2021
2 parents 6eecd86 + 9d793b0 commit 487e1d8
Show file tree
Hide file tree
Showing 58 changed files with 58,550 additions and 9 deletions.
16 changes: 10 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
NAME := zograscope

CXXFLAGS += -std=c++11 -Wall -Wextra -MMD -MP -Isrc/ -Ithird-party/ -DYYDEBUG
CXXFLAGS += -pthread
CFLAGS += -MMD -MP
CFLAGS += -Ithird-party/tree-sitter/include/ -Ithird-party/tree-sitter/src/
CXXFLAGS += -std=c++11 -Wall -Wextra -DYYDEBUG -pthread
CXXFLAGS += -Isrc/ -Ithird-party/ $(CFLAGS)
LDFLAGS += -g -lboost_iostreams -lboost_program_options -lboost_filesystem
LDFLAGS += -lboost_system -pthread

Expand Down Expand Up @@ -81,9 +83,10 @@ rwildcard = $(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) \

lib := $(out_dir)/lib$(NAME).a

lib_sources := $(call rwildcard, src/, *.cpp) \
$(call rwildcard, third-party/, *.cpp)
lib_sources := $(filter-out %.gen.cpp,$(lib_sources))
lib_sources_cpp := $(call rwildcard, src/, *.cpp) \
$(call rwildcard, third-party/, *.cpp)
lib_sources_cpp := $(filter-out %.gen.cpp,$(lib_sources_cpp))
lib_sources_c := $(call rwildcard, third-party/, *.c)

lib_autocpp := $(addprefix $(out_dir)/src/c/, \
c11-lexer.gen.cpp c11-parser.gen.cpp)
Expand All @@ -94,7 +97,8 @@ lib_autohpp := $(addprefix $(out_dir)/src/c/, \
lib_autohpp += $(addprefix $(out_dir)/src/make/, \
make-lexer.gen.hpp make-parser.gen.hpp)

lib_objects := $(sort $(lib_sources:%.cpp=$(out_dir)/%.o) \
lib_objects := $(sort $(lib_sources_cpp:%.cpp=$(out_dir)/%.o) \
$(lib_sources_c:%.c=$(out_dir)/%.o) \
$(lib_autocpp:%.cpp=%.o))
lib_depends := $(lib_objects:.o=.d)

Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ of an experiment, but this situation gets better.
| C | C11 and earlier with common extensions, but without K&R syntax |
| C++ | C++14 and earlier with common extensions |
| GNU Make | Most of the syntax |
| Lua | Version 5.3 |

#### C ####

Expand Down Expand Up @@ -98,6 +99,14 @@ Note the following:
some tuning, this should happen over time (Makefiles aren't changed that
often)

#### Lua ####

Newly added (March 2021) with very little testing so far. However, the
language is small and simple enough that it should not pose many difficulties.

Note the following:
* non-5.3 versions might still work, albeit possibly with worse results

#### Other ####

More languages should be added in the future, maybe with external parsers that
Expand Down Expand Up @@ -265,6 +274,8 @@ custom allocators.

[TinyXML2][tinyxml2] is used for parsing XML.

[tree-sitter][tree-sitter] is used for parsing some languages.

[Catch2][catch] is used for tests.

### References ###
Expand Down Expand Up @@ -297,6 +308,7 @@ Kaizhong Zhang and Dennis Shasha.
[dtl]: https://github.com/cubicdaiya/dtl
[pmr]: https://github.com/phalpern/CppCon2017Code
[tinyxml2]: https://github.com/leethomason/tinyxml2
[tree-sitter]: https://tree-sitter.github.io/
[catch]: https://github.com/catchorg/Catch2

[cd]: http://www.merlin.uzh.ch/publication/show/2531
Expand Down
8 changes: 8 additions & 0 deletions src/Language.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "c/C11Language.hpp"
#include "make/MakeLanguage.hpp"
#include "srcml/cxx/SrcmlCxxLanguage.hpp"
#include "ts/lua/TSLuaLanguage.hpp"
#include "tree.hpp"

namespace fs = boost::filesystem;
Expand Down Expand Up @@ -61,6 +62,9 @@ Language::create(const std::string &fileName, const std::string &l)
if (lang == "cxx" || lang == "srcml:cxx") {
return std::unique_ptr<SrcmlCxxLanguage>(new SrcmlCxxLanguage());
}
if (lang == "lua" || lang == "ts:lua") {
return std::unique_ptr<TsLuaLanguage>(new TsLuaLanguage());
}
if (lang == "make") {
return std::unique_ptr<MakeLanguage>(new MakeLanguage());
}
Expand Down Expand Up @@ -99,6 +103,10 @@ detectLanguage(const std::string &stem, const std::string &ext)
return "cxx";
}

if (ext == ".lua") {
return "lua";
}

using boost::algorithm::ends_with;
if (ends_with(stem, "makefile") || ext == ".mk" || ext == ".mak") {
return "make";
Expand Down
5 changes: 4 additions & 1 deletion src/STree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,16 @@ print(const PNode *node, const std::string &contents, Language &lang)

Decoration labelHi = 78_fg + bold;
Decoration stypeHi = 222_fg;
Decoration badStypeHi = 88_bg;

trees::print(std::cout, node, [&](std::ostream &os, const PNode *node) {
bool badSType = (node->stype == SType{} && !node->children.empty());

os << (labelHi << '`'
<< contents.substr(node->value.from, node->value.len)
<< '`')
<< ", "
<< (stypeHi << lang.toString(node->stype))
<< ((badSType ? badStypeHi : stypeHi) << lang.toString(node->stype))
<< '\n';
});
}
Expand Down
4 changes: 2 additions & 2 deletions src/tree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ struct Node
State state : 8;
bool satellite : 1; // Decorative element or node whose match was finalized.
bool moved : 1;
bool last : 1;
bool leaf : 1;
bool last : 1; // This is root of a tree from the last layer.
bool leaf : 1; // This node corresponds to something in the source.

Node(allocator_type al = {})
: children(al),
Expand Down
213 changes: 213 additions & 0 deletions src/ts/TSTransformer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
// Copyright (C) 2021 xaizek <[email protected]>
//
// This file is part of zograscope.
//
// zograscope is free software: you can redistribute it and/or modify
// it under the terms of version 3 of the GNU Affero General Public License as
// published by the Free Software Foundation.
//
// zograscope is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with zograscope. If not, see <http://www.gnu.org/licenses/>.

#include "TSTransformer.hpp"

#include <cassert>
#include <cstdint>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

#include <boost/utility/string_ref.hpp>
#include "tree_sitter/api.h"

#include "TreeBuilder.hpp"
#include "types.hpp"

// XXX: hard-coded width of a tabulation character.
// Used by updatePosition() to advance the column to the next tab stop.
const int tabWidth = 4;

// Helpers private to this translation unit, defined further below.
static void updatePosition(boost::string_ref str, int &line, int &col);
static bool isSeparator(Type type);

// Remembers everything needed for a later call to transform(): the text to
// parse, tree-sitter grammar, builder that receives nodes, maps from names of
// tree-sitter node types to STypes and Types, and the debug flag.  No parsing
// happens here.
TSTransformer::TSTransformer(
    const std::string &contents,
    const TSLanguage &tsLanguage,
    TreeBuilder &tb,
    const std::unordered_map<std::string, SType> &stypes,
    const std::unordered_map<std::string, Type> &types,
    bool debug
)
    : contents(contents),
      tsLanguage(tsLanguage),
      tb(tb),
      stypes(stypes),
      types(types),
      debug(debug)
{
}

// Parses the contents with tree-sitter and converts the resulting tree into
// PNodes, handing the converted root to the TreeBuilder.  Throws
// std::runtime_error if the grammar is rejected by the tree-sitter runtime or
// if no tree could be built.  In debug mode, node types that are missing from
// the stypes/types maps are printed to the standard output afterwards.
void
TSTransformer::transform()
{
    std::unique_ptr<TSParser, void(*)(TSParser *)> parser(ts_parser_new(),
                                                          &ts_parser_delete);

    // Assigning a language fails when the grammar was generated for an
    // incompatible version of tree-sitter.  Report that explicitly rather than
    // letting it show up below as a generic parsing failure.
    if (!ts_parser_set_language(parser.get(), &tsLanguage)) {
        throw std::runtime_error("Failed to set language of the parser");
    }

    // The API accepts length as a 32-bit value, so make the conversion visible.
    std::unique_ptr<TSTree, void(*)(TSTree *)> tree(
        ts_parser_parse_string(parser.get(), nullptr, contents.c_str(),
                               static_cast<uint32_t>(contents.size())),
        &ts_tree_delete
    );
    if (tree == nullptr) {
        throw std::runtime_error("Failed to build a tree");
    }

    // Reset traversal state: byte offset right after the last processed leaf
    // and the line/column (both counted from one) that correspond to it.
    position = 0;
    line = 1;
    col = 1;

    tb.setRoot(visit(ts_tree_root_node(tree.get())));

    if (debug) {
        for (const std::string &type : badSTypes) {
            std::cout << "(TSTransformer) No SType for: " << type << '\n';
        }
        for (const std::string &type : badTypes) {
            std::cout << "(TSTransformer) No Type for: " << type << '\n';
        }
    }
}

// Converts a tree-sitter node into a PNode and returns it.  Children that have
// no children of their own are passed to visitLeaf(), all other children are
// converted recursively and appended.  In debug mode a node whose type has no
// entry in the stypes map is recorded in badSTypes along with its text.
PNode *
TSTransformer::visit(const TSNode &node)
{
    const char *const nodeType = ts_node_type(node);

    SType nodeSType = {};
    const auto known = stypes.find(nodeType);
    if (known != stypes.end()) {
        nodeSType = known->second;
    } else if (debug) {
        const uint32_t start = ts_node_start_byte(node);
        const uint32_t end = ts_node_end_byte(node);
        const std::string text(contents.c_str() + start, end - start);
        badSTypes.insert(std::string(nodeType) + ": `" + text + '`');
    }

    PNode *const result = tb.addNode({}, nodeSType);

    const uint32_t total = ts_node_child_count(node);
    for (uint32_t idx = 0; idx != total; ++idx) {
        const TSNode child = ts_node_child(node, idx);

        if (ts_node_child_count(child) != 0) {
            tb.append(result, visit(child));
            continue;
        }

        // Leaves with unknown type keep the default SType here, visitLeaf()
        // decides whether to replace it.
        const auto leafIt = stypes.find(ts_node_type(child));
        const SType leafSType = (leafIt == stypes.end()) ? SType{}
                                                         : leafIt->second;
        visitLeaf(leafSType, result, child);
    }

    return result;
}

// Turns a childless tree-sitter node into a leaf PNode appended to pnode.
// The text between the previous leaf and this one is walked first so that the
// recorded line and column are those of the leaf's first character.  A leaf
// with the default SType whose Type is a separator gets the "separator" SType.
void
TSTransformer::visitLeaf(SType stype, PNode *pnode, const TSNode &leaf)
{
    const uint32_t from = ts_node_start_byte(leaf);
    const uint32_t to = ts_node_end_byte(leaf);
    const std::uint32_t len = to - from;
    const char *const text = contents.c_str();

    // Skip over whatever lies between the end of the previous leaf and the
    // beginning of this one.
    updatePosition(boost::string_ref(text + position, from - position),
                   line, col);

    const Type type = determineType(leaf);
    if (stype == SType{} && isSeparator(type)) {
        stype = stypes.at("separator");
    }

    tb.append(pnode,
              tb.addNode(Text{from, len, 0, 0, static_cast<int>(type)},
                         Location{line, col, 0, 0},
                         stype));

    // Now step over the leaf itself.
    updatePosition(boost::string_ref(text + from, len), line, col);
    position = to;
}

// Advances line and column by walking over every character of the string: a
// newline starts a new line at column one, a tabulation moves the column to the
// next tab stop and any other character takes up a single column.
static void
updatePosition(boost::string_ref str, int &line, int &col)
{
    for (const char ch : str) {
        if (ch == '\n') {
            ++line;
            col = 1;
        } else if (ch == '\t') {
            col += tabWidth - (col - 1)%tabWidth;
        } else {
            ++col;
        }
    }
}

// Looks up Type of a tree-sitter node by the name of its type.  Names that are
// absent from the types map yield Type::Other and, in debug mode, are recorded
// in badTypes along with the text of the node.
Type
TSTransformer::determineType(const TSNode &node)
{
    const char *const typeName = ts_node_type(node);

    const auto known = types.find(typeName);
    if (known == types.cend()) {
        if (debug) {
            const uint32_t start = ts_node_start_byte(node);
            const uint32_t end = ts_node_end_byte(node);
            const std::string text(contents.c_str() + start, end - start);
            badTypes.insert(std::string(typeName) + ": `" + text + '`');
        }
        return Type::Other;
    }

    return known->second;
}

// Checks whether a token of the given type is treated as a separator.  Every
// enumerator is listed explicitly and there is no default label, so a compiler
// that warns about unhandled enumerators can flag newly added ones.
static bool
isSeparator(Type type)
{
    switch (type) {
        // Tokens that are not separators.
        case Type::Virtual:
        case Type::Functions:
        case Type::UserTypes:
        case Type::Identifiers:
        case Type::Specifiers:
        case Type::Directives:
        case Type::Comments:
        case Type::StrConstants:
        case Type::IntConstants:
        case Type::FPConstants:
        case Type::CharConstants:
        case Type::NonInterchangeable:
            return false;

        // Tokens that are separators.
        case Type::Jumps:
        case Type::Types:
        case Type::LeftBrackets:
        case Type::RightBrackets:
        case Type::Comparisons:
        case Type::Operators:
        case Type::LogicalOperators:
        case Type::Assignments:
        case Type::Keywords:
        case Type::Other:
            return true;
    }

    // Reached only for a value that matches none of the labels above.
    assert(false && "Unhandled enumeration item");
    return false;
}
Loading

0 comments on commit 487e1d8

Please sign in to comment.