diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 00000000..690e3d1c --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,3 @@ + +rust.pdf: rust.texi + texi2pdf $< diff --git a/doc/rust.texi b/doc/rust.texi new file mode 100644 index 00000000..3a636b44 --- /dev/null +++ b/doc/rust.texi @@ -0,0 +1,671 @@ +\input texinfo @c -*-texinfo-*- +@c %**start of header +@setfilename rust.info +@settitle Rust Documentation +@setchapternewpage odd +@c %**end of header + +@syncodeindex fn cp + +@ifinfo +This manual is for the ``Rust'' programming language. + +Copyright 2006 Graydon Hoare + +All rights reserved +Licensed to the public under the terms of the GNU GPL (>= 2.0). +See the file COPYING for details +@end ifinfo + +@dircategory Programming +@direntry +* rust: (rust). Rust programming language +@end direntry + +@titlepage +@title Rust +@subtitle A general-purpose programming language +@author Graydon Hoare + +@page +@vskip 0pt plus 1filll +Copyright @copyright{} 2006 Graydon Hoare +All rights reserved +Licensed to the public under the terms of the GNU GPL (>= 2.0). +See the file COPYING for details +@end titlepage + +@ifnottex +@node Top +@top Top + +Rust Documentation + +Rust is a general-purpose programming language. It supports +imperative-procedural, concurrent-actor and pure-functional +styles. Rust also supports generic programming and metaprogramming, in +both static and dynamic forms. +@end ifnottex + +@node Fundamentals +@chapter Fundamentals + + +@page +@node Design perspective +@section Design perspective + + +Rust is a ``curly-brace'' block-structured language; it generally +visually resembles the ``C'' family, but differs significantly in +syntactic and semantic details. + +Rust's semantics are a mixture of elements found in Ada, OCaml, CLU +and Hermes. + +The language design attempts to maximize the following goals: + +@itemize +@item Error detection and isolation. +@item Clarity and precision of expression. +@item Implementation simplicity.
+@item Runtime efficiency. +@end itemize + +In pursuit of these goals, some common language features were +rejected. While not unique in rejecting these features -- similar +rejections are made in the ancestral languages noted above -- the +informed reader should keep in mind that such rejections are usually +@emph{intentional}. An uncommon technique does not necessarily arise +from ignorance of the alternative. This manual may provide discussion +or footnotes for additional rationale, when notable cases arise. + +Three characteristics of Rust's design motivate most of the rejected +features. + +First, Rust occasionally strays from the standard language-design +principles of @i{minimality} and @i{orthogonality}. Rust is +unapologetically a medium-sized language; there are many languages +smaller and simpler. Some Rust constructs are only useful in +particular ``safe'' or ``efficient'' combinations. Other constructs +are conspicuously duplicated -- with important specializations for +different cases -- due to concerns for safety, clarity or speed. + +Second, while Rust can probably be classified as a ``static language'' +-- owing to the presence of a strong static type system and a certain +amount of static reasoning about storage and lifecycle -- it also +resists the habit many ``static languages'' have of throwing out all +information and language-tool functionality after compile time. Rust's +dynamic semantics are intended to support many of the useful features +of ``dynamic language'' environments such as reflective debugging, +live code generation, ``hot'' code replacement, heterogeneous +collections and latent-typed generic functions. + +Third, Rust rejects a family of features that stem from a particular +interpretation of the Object Orientation paradigm, and that have come +to be ``canonical'' in mainstream (non-academic) language +designs.
Features such as code-inheritance, ubiquitous dynamic +binding, cyclical memory graphs, pointer-rich data structures, tracing +garbage collection, and shared-visibility concurrency are all avoided +in Rust. The legitimate motivations for these features are addressed +via other means. + +@page +@node Structural artifacts +@section Structural artifacts + +A collection of Rust source code can be thought of @emph{structurally} +as a set of artifacts that logically @i{contain} one another. There +are four kinds of artifacts in this view: @i{containers}, @i{modules}, +@i{values}, and @i{slots}. + +@subsection Containers + +A @dfn{container} is a unit of configuration, linkage and distribution +for Rust code. Containers logically contain modules. Containers have +``unfriendly'' names -- names that look like ugly and meaningless +numbers -- but these names are ``universally unique''. + +The source form of a container is a @dfn{container file}, which is a +text file ending in @code{.rc}. The compiled form of a container is a +binary dynamic-link library, such as an ELF or PE library (ending in +@code{.so} or @code{.dll} respectively). + +Container files contain a description of a tree of modules, and +instructions about how to compile the container. The Rust compiler +operates on container files. Container files may reference other +containers, in order to import qualified module names from them, but +the reference graph of container files is a DAG. + +@subsection Modules + +A @dfn{module} is a unit of organization within a container. The tree +of modules in a container corresponds 1:1 with a tree of directories +and files in a filesystem. Thus there are two kinds of modules: +@dfn{directory modules} and @dfn{source modules}. A directory module +contains sub-modules. A source module contains values. + +Source modules, also called @dfn{source files}, are files ending +in @code{.rs}. These files contain values.
+ +Modules are arranged in a human-friendly namespace within a container. +It is the responsibility of a team of programmers working on a +container to ensure that their module names do not collide inside the +container. Module names can be reused in different containers, +however; the names in one container cannot collide with the names in +another container. + +Modules contain values. There are many types of value; the complete +list is provided in the next section. For now it is sufficient to note +that most Rust code is concerned with manipulating values; far less of +it relates to modules and containers. + +@subsection Slots + +Finally, within values there may be @i{slots}. Values are also +@emph{referenced} by slots. A @dfn{slot} is a named memory location +that can refer to a value. Each module defines a set of slots that are +the ``top level'' slots of the module. Further slots exist in many +contexts: + +@itemize +@item Variables within functions, iterators, predicates. +@item Fields within records, alternatives and programs. +@item Components of vectors. +@end itemize + + +@page +@node Values +@section Values + +There are many kinds of value in a Rust module. + +@menu +* Primitive values:: Values supported by (some) hardware. +* Dynamic values:: Latently-typed values. +* Arithmetic values:: High-precision numbers. +* Textual values:: Strings and characters. +* Records:: Fixed products of heterogeneous types. +* Vectors:: Dynamic products of homogeneous types. +* Alternatives:: Sums of heterogeneous types. +* Functions:: Simple subroutines. +* Iterators:: Scoped coroutines. +* Predicates:: Pure functions involved in typestates. +* Programs:: General coroutines. +* Processes:: Instances of programs. +* Ports:: Inter-process message inputs. +* Channels:: Weak capabilities to send messages to ports. +* Types:: Values representing types. +* Quotations:: Values representing source code.
+@end menu + +@node Primitive values +@subsection Primitive values + +@node Dynamic values +@subsection Dynamic values + +@node Arithmetic values +@subsection Arithmetic values + +@node Textual values +@subsection Textual values + +@node Records +@subsection Records + +@node Vectors +@subsection Vectors + +@node Alternatives +@subsection Alternatives + +@node Functions +@subsection Functions + +@node Iterators +@subsection Iterators + +@node Predicates +@subsection Predicates + +@node Programs +@subsection Programs + +@node Processes +@subsection Processes + +@node Ports +@subsection Ports + +@node Channels +@subsection Channels + +@node Types +@subsection Types + +@node Quotations +@subsection Quotations + + +@page +@node Type system +@section Type system + + +@node Value and slot types +@subsection Value and slot types + +Every value has a unique type. + +Every slot has a unique type. + +If a slot @var{S} references a value @var{V}, the type of @var{S} is +identical to the type of @var{V}. + + +@node Limitation +@subsection Limitation + +Some types are @i{limited}. A type @var{T} is @dfn{limited} iff any of +these conditions hold: + +@itemize +@item The type @var{T} is declared as @code{lim}. +@item The type @var{T} contains a limited slot. +@item The type @var{T} is a process type. +@end itemize + +Limited values cannot be copied or transmitted. See @ref{Memory model} +for the definitions of copying and transmission. + +A type that is a process type or declares itself limited is called +@dfn{directly limited}. A type that simply contains a limited slot +is called @dfn{indirectly limited}. + + +@node Purity +@subsection Purity + +Subroutines (functions or iterators) may be @i{pure} or @i{impure}. A +subroutine @var{S} is @dfn{impure} iff any of these conditions hold: + +@itemize +@item The subroutine @var{S} is @emph{not} declared as @code{pure} +@item The subroutine @var{S} calls any impure subroutines. +@item The subroutine @var{S} calls any channels. 
+@end itemize + +Purity exists as a concept in Rust for two reasons: + +@itemize +@item As a basic aid to code comprehension. Pure subroutines are easier +to reason about, so programmers may wish to ensure that portions of +their work are pure. +@item Predicates are effectively pure boolean functions. Predicates +are @emph{required} to be pure for the typestate system to function +correctly. Since purity is required for predicates, extending the +concept of purity to functions and iterators permits predicates to +call them safely. Otherwise predicates would have to be +self-contained. +@end itemize + + +@page +@node Typestates +@section Typestates + +Every block of statements containing expressions maps to a block of +primitive statements in a normal form. In this form, every statement +is logically one of: + +@itemize +@item A conditional jump statement. +@item A move or copy statement between two slots. +@item An allocation or deallocation statement. +@item Invocation of a function, iterator, or channel with a ``call message''. +@end itemize + +A @dfn{point} is defined as the boundary between two normalized +sequential statements. Every point in a normalized block has a +@i{typestate}. Two typestates are therefore defined for each +normalized statement: + +@itemize +@item The @dfn{prestate}, which precedes the statement. +@item The @dfn{poststate}, which follows the statement. +@end itemize + +A @dfn{typestate} is formally a set of @var{N}-ary boolean predicates +over visible slots. + +Typestates form a semilattice ordered by subset-inclusion: @var{X} < +@var{Y} means @var{X} is a subset of @var{Y}, and the semilattice +``join'' operation is ``set intersection''. + +When @var{K} statements lead to a single point, the point's typestate +is the pairwise join of the @var{K} poststates of the preceding +statements. + +Predicates are declared with @code{pred}, and have names and +signatures similar to functions.
The body of a predicate is +syntactically a pure boolean function. The signature of a predicate +cannot include the keyword @code{pure} or the @code{bool} return type; +these are implied and non-optional. + + +@node Asserting predicates +@subsection Asserting predicates + +The @code{assert} statement either adds the asserted predicate to its +poststate, or else fails, raising an exception. If a predicate is +declared as @code{auto}, assertions are inserted automatically in some +contexts. + +Specifically: every operation (statement type) has a set of +preconditions, formulated as predicates. If a precondition names a +predicate that is not present in the statement's prestate, one of two +cases occurs: + +@itemize +@item If the missing predicates are all @code{auto} and all the +preconditions of the missing @code{auto}-predicates are met, the +compiler may insert assertions for each missing predicate. +@item Otherwise the condition is a compile-time error. +@end itemize + +@node Dropping predicates +@subsection Dropping predicates + +All predicates in a typestate can be @i{dropped}. To @dfn{drop} a +predicate is to remove it from the typestate at the associated +point. For a slot @var{S}, dropping the @code{init(S)} predicate may +cause memory to be released or a finalizer to be run. See +@ref{Memory model} for discussion. All other predicates can be dropped without +side-effect. + + +@node Predicates on moved values +@subsection Predicates on moved values + +When a statement moves a value from a slot @var{A} to a slot @var{B}: + +@itemize +@item Every predicate involving @var{B} is dropped. +@item Every remaining predicate involving @var{A} has @var{B} +substituted for @var{A}. +@end itemize + + +@node Predicates on copied values +@subsection Predicates on copied values + +When a statement copies a value from a slot @var{A} to a slot @var{B}: + +@itemize +@item Every predicate involving @var{B} is dropped.
+@item For every remaining predicate @var{P} involving @var{A}, +a copy of @var{P} is added to the typestate with @var{B} substituted +for @var{A}. +@end itemize + + +@node Formal typestates +@subsubsection Formal typestates + +When defining a type @var{T}, a typestate @var{S} can optionally be +associated with @var{T}. When present, we say that @var{S} is the +@dfn{formal typestate} of @var{T}. When a slot has a type @var{T} with +a formal typestate @var{S}, it means that all the predicates in +@var{S} are present in the typestate of every point in which the slot +is @i{visible}. + +A slot becomes @dfn{visible} at the point immediately following the +statement containing its declaration, and remains visible through to +the end of the last statement in the block containing its declaration. + + +@node Reflection +@subsection Reflection + +Types and predicates are reflected into runtime values. Runtime values +can always be converted to type @code{dyn} or @code{lim dyn}, which +carries the type of its value (including formal typestate) along with +it. + +The runtime representation of a type can be compared to the runtime +representation of another type. + +To connect runtime and compile-time values, a type-switch statement +exists over @code{dyn} values; each arm of the type-switch +temporarily moves the @code{dyn}'s value into a typed slot. + + +@node Type system comparison with other languages +@subsection Type system comparison with other languages + +We borrow structure (but not terminology) from Ada and Hermes when +organizing the type system. In particular: + +@itemize +@item A constrained type (an Ada "subtype") is a base type plus a set of +constraints. Constraints are formulated using Hermes-style +typestate predicates rather than Ada's fixed set of constraints. +@item There is no general subtype lattice (as in OO languages). There is only +one type for each value. This is common to both languages. 
+@end itemize + + +@node Pragmatic notes +@subsection Pragmatic notes + +Constrained types do not participate in overload resolution, because +predicates can always be dropped. + +If you wish to produce a new type that wraps an old type, you can just +make a single-field rec. The allocation rules will not impose any +penalty for doing so. + + +@page +@node Memory model +@section Memory model + + +@node Semantic model +@subsection Semantic model + +Rust's memory model logically consists of a set of values, arranged +in a DAG. Every value has exactly one type, so values may be said to +be limited or not, just like types. See @ref{Limitation} for +details. + +The relationships between slots and values are the edges in the +memory DAG. + + +@node Initialization +@subsection Initialization + +Every slot is either @i{initialized} or @i{empty}. + +A slot @var{S} is @dfn{initialized} iff it refers to a value in the +memory DAG, otherwise @var{S} is @dfn{empty}. When the slot +@var{S} is initialized, the predicate @code{init(S)} is added to the +typestate. If the predicate @code{init(S)} is dropped, the slot +@var{S} becomes empty. + + +@node Sharing +@subsection Sharing + +Conceptually, every slot holds a unique subtree that is not shared. + +In real memory, some number of memory subtrees may be shared +copy-on-write (CoW). This is why we say that memory is a DAG. The +amount of sharing is at the discretion of the implementation, within +some restrictions: + +@itemize +@item Limited values cannot be shared, because they cannot be copied. +@item Interior values cannot be shared (see @ref{Allocation modes}). +@end itemize + +When a node is shared CoW, it has a reference-count and its memory is +reclaimed only when the reference count drops to zero. When a node is +not shared CoW, its memory is reclaimed at a specific point: when its +slot loses the @code{init} predicate. + + +@node Assignment +@subsection Assignment + +Rust has two built-in notions of assignment.
+ +First, one can @dfn{move} a value between slots. Movement is +written with the operator @code{<-}; an example is @code{B <- A;} +which means ``move A to B''. All values can be moved. + +Second, one can @dfn{copy} a value between slots. Copying is +written with the operator @code{=}; an example is @code{B = A;} +which means ``copy A to B''. Limited values cannot be copied. + +In both move and copy assignments, the left slot is called the +@dfn{destination} and the right slot is called the @dfn{source}. + +The effect of copying and movement on typestates is described in +@ref{Predicates on moved values}, and @ref{Predicates on copied values}. In short, predicates associated with +the destination slot are dropped, and predicates associated with the +source slot are either replaced or duplicated to refer to the +destination slot. + +The @code{init} predicate models the initialization state of a slot, +and thus the presence or absence of a slot's reference to a +value. When an @code{init} predicate is dropped during an assignment, +it may thereby drop a reference to a value. This in turn may cause +either finalization -- if the value is limited and contains processes +with finalizers -- and/or storage reclamation, if the unreferenced +value was exterior-allocated (see @ref{Allocation modes}). + +The behavior of a move or copy is completely defined by the typestate +transition rules in @ref{Predicates on moved values}, and @ref{Predicates on copied values}, as well as one +additional rule: that in either @code{B = A;} or @code{B <- A;}, if the +prestate has @code{init(A)} and slot @var{A} refers to value @var{V} +before the assignment, then slot @var{B} refers to value @var{V} after +the assignment. + + +@node Allocation modes +@subsection Allocation modes + +Values can be allocated in either an @i{exterior} or @i{interior} +mode. @dfn{Exterior allocation} typically means ``allocated on the +heap''. @dfn{Interior allocation} means that the value is allocated +within a contiguous block of memory owned by some other +value.
Interior allocations are commonly made for slots in @code{rec} +and @code{vec} values, as well as slots in the runtime stack. + +Slots also have modes, each of which may imply that the slot's +referent has a particular allocation mode. See @ref{Slot modes}. + + +@node Value transplanting +@subsection Value transplanting + +When a value is moved or copied into a slot that requires an +allocation mode different from the allocation mode of the value, the +value may be @i{transplanted} during the move or +copy. @dfn{Transplanting} a value means moving or copying the value by +byte-copying it into a new location in memory. Transplanting may also +involve allocating fresh memory from the heap, if the transplant +destination is exterior. + +The slot modes (see @ref{Slot modes}) ensure that directly limited +types are never transplanted. + +Rust's memory model strives to minimize three things simultaneously: + +@itemize +@item The amount of value transplanting. +@item The amount of exterior allocation. +@item The amount users have to think about either of the above. +@end itemize + + +@node Type size +@subsection Type size + +Values consume a particular amount of memory when they are allocated. +The type of a value may, or may not, determine how much memory is +required to store a value of that type. + +If a type is parametric, opaque, or forward-declared (and not yet +defined) we say that it is of @dfn{indefinite size}. Otherwise we +say that the type has @dfn{definite size}. Since values and slots also +have unique types, we may refer to slots or values as having definite +or indefinite size. + +Note however that under the slot modes (see @ref{Slot modes}), the +space allocated for a slot itself has definite size: slots either +contain values with definite size, or else hold addresses, which have +definite size themselves. Thus the contribution of a slot to its +containing structure's size is always definite.
In other words, the +property of ``having indefinite size'' does not ``infect'' containing +structures. + +If a type has definite size that is equal to or smaller than the size +of a machine word (in bits), we say that the type has @dfn{subword +size}. All types with definite sizes larger than a machine word, as +well as types with indefinite size, are said to have @dfn{superword +size}. + + +@node Slot modes +@subsection Slot modes + +Slots have modes. A slot's mode determines how the slot relates to its +referent. There are four slot modes: + +@itemize +@item Immediate +@item Dependent +@item Exterior +@item Interior +@end itemize + +A slot is immediate iff it has subword size. The space allocated to an +immediate slot is also used to contain the value, and every copy is +effectively a ``transplant'' because there is no advantage to copying +less than the full value. + +A slot is dependent iff its type has superword size, but refers to a +value that is statically known to be initialized and constant at every +point where the slot is visible. The most important cases of dependent +slots are non-immediate constant slots for subroutine arguments, +yields and returns, and the arms of @code{alt} statements. Dependent +slots hold addresses of either interior or exterior values, but do +@emph{not} manipulate reference counts, deinitialize or deallocate +values. + +A slot is interior iff its type has definite size, its type is not +directly limited, and the slot is not dependent. Interior slots always +contain their interior allocation; assignment of an exterior value to +an interior slot causes a transplant to the interior storage. Interior +slots are somewhat similar to immediate slots, insofar as their value +is stored within the structure that contains the slot. Interior slots +differ from immediate slots insofar as the @emph{address} of an +interior slot can be stored in a dependent slot. Immediate slots are +always passed by value. 
+ +A slot is exterior iff the slot is not dependent, and either its type +has indefinite size or its type is directly limited. Exterior slots +always contain the address of an exterior allocation; assignment of an +interior value to an exterior slot causes a transplant to exterior +storage. + +@node Index + +@printindex cp + +@bye diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 00000000..59a1a437 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,26 @@ + + +all: parser.cmo parser.cmi lexer.cmo ast.cmo + +.phony: clean + +clean: + rm -f *.cmo *.cmi lexer.ml parser.ml parser.mli + +parser.cmo: parser.ml parser.cmi ast.cmo + ocamlc -c $< + +lexer.cmo: lexer.ml parser.cmi + ocamlc -c $< + +%.cmo: %.ml + ocamlc -c $< + +%.cmi: %.mli + ocamlc -c $< + +%.ml: %.mll + ocamllex $< + +%.ml %.mli: %.mly + ocamlyacc $< diff --git a/src/ast.ml b/src/ast.ml new file mode 100644 index 00000000..acab6a14 --- /dev/null +++ b/src/ast.ml @@ -0,0 +1,336 @@ +open Array;; +open Hashtbl;; + +(* + * There are two kinds of rust files: + * + * .rc files, containing crates. + * .rs files, containing source. + * + *) + +(* Slot names are given by a dot-separated path within the current + module namespace. *) + +type rs_name = string array +;; + + +type ty_prim = + TY_unsigned of int + | TY_signed of int + | TY_ieee_bfp of int + | TY_ieee_dfp of int + | TY_ptr of int (* like unsigned, but no arithmetic ops *) +;; + +type ty_arith = + TY_int + | TY_rat +;; + +type rs_type = + TY_nil + | TY_dyn + | TY_type + + | TY_prim of ty_prim + | TY_arith of ty_arith + | TY_str + | TY_char + + | TY_rec of ty_rec + | TY_alt of ty_alt + | TY_vec of ty_vec + + | TY_func of ty_subr + | TY_iter of ty_subr + | TY_chan of ty_sig + + | TY_prog + | TY_proc + | TY_port of ty_sig + + | TY_pred of ty_pred + | TY_quote of ty_quote + | TY_const of rs_type + +(* + * An fstate may include *optional* names in its args. 
+ * The "formal" name is implied where names are missing, + * which is for example the return / yield value on a subr + * or the vector-element type on a vec. + *) + +and rs_fstate = + { + fstate_name: rs_name; + fstate_args: (rs_name option) array; + } + +and rs_state = + { + state_name: rs_name; + state_args: rs_name array; + } + +and ty_rec = + { + rec_slots: ty_rec_slot array; + rec_state: rs_state; + } + +and ty_rec_slot = + { + rec_slot_name: string; + rec_slot_type: rs_type; + } + +and ty_alt = ty_alt_case array + +and ty_alt_case = + { + alt_case_name: string; + alt_case_rec: ty_rec; + } + +and ty_vec = + { + vec_elt_type: rs_type; + vec_state: rs_fstate; + } + +and ty_subr = + { + subr_inline: bool; + subr_pure: bool; + subr_sig: ty_sig; + } + +and ty_sig = + { + sig_params: rs_type array; + sig_result: rs_type; + sig_istate: rs_state; + sig_ostate: rs_fstate; + } + +and ty_prog = + { + prog_auto: bool; + } + +and ty_pred = + { + pred_auto: bool; + pred_inline: bool; + pred_params: rs_type array; + pred_state: rs_state; + } + +and ty_quote = + TY_quote_expr + | TY_quote_type + | TY_quote_stmt + (* Probably this list should be a lot longer; the canonical rule + I've been using is to make a quotation type for every + nonterminal *) + + +let init_star_fstate : rs_fstate = + { + fstate_name = Array.make 1 "init"; + fstate_args = Array.make 1 None; + } +;; + +(* Values *) + +type val_prim = + VAL_unsigned of int + | VAL_signed of int + | VAL_ieee_bfp of float + | VAL_ieee_dfp of (int * int) + | VAL_ptr of int + +(* + * The "value" type is the result of evaluation of an expression. Or + * seen another way, it is the sort of thing that can be put in a + * slot. + * + * Implementations are required to be able to construct dyns, but + * implementations are *not* required to construct anything fancier + * than dyns. So here our interpreter is defined purely over dyns. + * All our values are dyns at runtime. 
+ * + * Our 'dyn' type, therefore, is only relevant to the static reasoning + * stage; it represents places where the compiler lacks static type + * information. + *) + +type rs_val = + VAL_dyn of (rs_type * rs_val_dyn) + +and rs_val_dyn = + + VAL_nil + | VAL_prim of val_prim + | VAL_arith of Num.num + | VAL_str of string + | VAL_char of char + + | VAL_rec of val_rec + | VAL_alt of val_rec + | VAL_vec of val_vec + + | VAL_func of rs_stmt + | VAL_iter of rs_stmt + | VAL_chan of int + + | VAL_prog of val_prog + | VAL_proc of val_proc + | VAL_port of (val_proc * int) + + | VAL_type of rs_type + | VAL_quote of val_quote + +and val_quote = + + VAL_quote_expr + | VAL_quote_type + | VAL_quote_stmt + +and val_rec = val_rec_slot array + +and val_rec_slot = + { + val_rec_slot_name: string; + val_rec_slot_val: rs_val option; + } + +and val_alt = + { + val_alt_case: ty_alt_case; + val_alt_rec: val_rec; + } + +and val_vec = rs_val array + +and val_prog = + { + prog_init: rs_stmt option; + prog_fini: rs_stmt option; + prog_main: rs_stmt; + prog_decls: rs_decl array; + } + +and val_proc = + { + proc_prog: val_prog; + proc_env: (string, rs_val) Hashtbl.t; + proc_pc: int; + proc_state: proc_exec_state; + proc_ports: int array; + } + +and proc_exec_state = + PROC_RUN + | PROC_RECV + | PROC_SEND + | PROC_FINI + +and rs_stmt = + STMT_while of stmt_while + | STMT_foreach of stmt_foreach + | STMT_for of stmt_for + | STMT_if of stmt_if + | STMT_try of stmt_try + | STMT_yield of (rs_expr option) + | STMT_return of rs_expr + | STMT_block of (rs_stmt array) + | STMT_assert of rs_fstate + | STMT_seti of rs_expr * rs_expr * rs_expr + | STMT_set of rs_expr * rs_expr + +and stmt_while = + { + while_expr: rs_expr; + while_body: rs_stmt; + } + +and stmt_foreach = + { + foreach_bindings: (string * rs_expr) array; + foreach_body: rs_stmt; + } + +and stmt_for = + { + for_init: rs_stmt; + for_test: rs_expr; + for_step: rs_stmt; + for_body: rs_stmt; + } + +and stmt_if = + { + if_test: rs_expr; + 
if_then: rs_stmt; + if_else: rs_stmt option; + } + +and stmt_try = + { + try_body: rs_stmt; + try_fail: rs_stmt option; + try_fini: rs_stmt option; + } + +and rs_expr = + + EXPR_binary of (rs_binop * rs_expr * rs_expr) + | EXPR_unary of (rs_unop * rs_expr) + | EXPR_literal of rs_val + | EXPR_name of rs_name + +and rs_binop = + + BINOP_or + | BINOP_and + + | BINOP_eq + | BINOP_ne + + | BINOP_lt + | BINOP_le + | BINOP_ge + | BINOP_gt + + | BINOP_lsl + | BINOP_lsr + | BINOP_asr + + | BINOP_add + | BINOP_sub + | BINOP_mul + | BINOP_div + | BINOP_mod + + | BINOP_idx + +and rs_unop = + | UNOP_not + +and rs_decl = + { + decl_name: string; + decl_type: rs_type; + decl_value: rs_val; + decl_state: rs_state; + } +;; + +type rs_decl_top = + PUBLIC of rs_decl + | PRIVATE of rs_decl +;; diff --git a/src/lexer.mll b/src/lexer.mll new file mode 100644 index 00000000..880cdf95 --- /dev/null +++ b/src/lexer.mll @@ -0,0 +1,109 @@ + + +{ + open Parser;; + let keyword_table = Hashtbl.create 100 + let _ = + List.iter (fun (kwd, tok) -> Hashtbl.add keyword_table kwd tok) + [ ("crate", CRATE); + ("module", MODULE); + ("use", USE); + ("pub", PUB); + + ("meta", META); + ("syntax", SYNTAX); + + ("if", IF); + ("else", ELSE); + ("while", WHILE); + ("for", FOR); + + ("try", TRY); + ("fail", FAIL); + ("fini", FINI); + + ("yield", YIELD); + ("ret", RET); + + ("type", TYPE); + ("pred", PRED); + + ("const", CONST); + ("pure", PURE); + + ("auto", AUTO); + ("inline", INLINE); + + ("nil", NIL); + ("ptr", PTR); + + ("int", INT); + ("rat", RAT); + + ("char", CHAR); + ("str", STR); + + ("alt", ALT); + ("vec", VEC); + ("dyn", DYN); + + ("func", FUNC); + ("iter", ITER); + ("chan", CHAN); + + ("proc", PROC); + ("prog", PROG); + ("port", PORT); + + ("spawn", SPAWN); + ("log", LOG); + ("reflect", REFLECT); + ("eval", EVAL); + ] +;; +} + +let bin = "0b" ['0' '1']['0' '1' '_']* +let hex = "0x" ['0'-'9' 'a'-'f' 'A'-'F']['0'-'9' 'a'-'f' 'A'-'F' '_']* +let oct = "0o" ['0'-'7']['0'-'7' '_']* +let dec = ['-' 
'+']?['0'-'9']* ['.']? ['0'-'9']+ (['e''E']['-''+']?['0'-'9']+)? +let id = ['a'-'z' 'A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']* + +rule token = parse + [ ' ' '\t' '\n' ] { token lexbuf } +| [ '+' ] { PLUS } +| [ '-' ] { MINUS } +| [ '*' ] { STAR } +| [ '/' ] { SLASH } +| [ '%' ] { PERCENT } +| [ '=' ] { ASSIGN } +| "+=" { PLUS_ASSIGN } +| "-=" { MINUS_ASSIGN } +| "*=" { STAR_ASSIGN } +| "/=" { SLASH_ASSIGN } +| "%=" { PERCENT_ASSIGN } +| [ '<' ] { LT } +| "<=" { LE } +| "==" { EQ } +| ">=" { GE } +| [ '>' ] { GT } +| [ '!' ] { NOT } +| [ '&' ] { AND } +| [ '|' ] { OR } +| "<<" { LSL } +| ">>" { LSR } +| ">>>" { ASR } +| [ '.' ] { DOT } +| [ '~' ] { TILDE } +| id as i + { try + Hashtbl.find keyword_table i + with + Not_found -> IDENT i } + +| (bin|oct|hex) as n { LIT_NUM (Num.num_of_int (int_of_string n)) } +| dec as d { LIT_NUM (Num.num_of_string d) } + +| ['"'] (([^'"']|"\\\"")* as s) ['"'] { LIT_STR s } +| ['\''] ( [^'\''] as c) ['\''] { LIT_CHAR c } +| "'\\''" { LIT_CHAR '\'' } diff --git a/src/parser.mly b/src/parser.mly new file mode 100644 index 00000000..2b89309a --- /dev/null +++ b/src/parser.mly @@ -0,0 +1,147 @@ +%{ + + (* Header *) + +open Ast;; + +let numty n = + match n with + Num.Ratio _ -> TY_rat + | _ -> TY_int +;; + + +%} + +/* Declarations */ + +/* Expression nodes that reduce to overridable 2 or 3-operand calls. */ +%token PLUS MINUS STAR SLASH PERCENT +%token ASSIGN PLUS_ASSIGN MINUS_ASSIGN STAR_ASSIGN SLASH_ASSIGN PERCENT_ASSIGN +%token LT LE EQ NE GE GT +%token NOT AND OR LSL LSR ASR + +/* No user-overriding beyond this line. */ +%token DOT + +/* Structural symbols. */ +%token COMMA SEMI COLON LPAREN RPAREN LBRACE RBRACE LBRACKET RBRACKET ARROW + +/* Keywords for the crate and module system. */ +%token CRATE MODULE USE PUB + +/* Metaprogramming keywords. */ +%token SYNTAX META TILDE + +/* Control-flow keywords. */ +%token IF ELSE WHILE FOR +%token TRY FAIL FINI +%token YIELD RET + +/* Type and type-state keywords. 
*/ +%token TYPE PRED + +/* Type qualifiers. */ +%token CONST PURE + +/* Declarator qualifiers. */ +%token PUBLIC AUTO INLINE + +/* Basic types. */ +%token NIL PTR +%token INT RAT +%token CHAR STR +%token BFP DFP SIGNED UNSIGNED + +/* Algebraic type constructors. */ +%token REC ALT VEC DYN + +/* Callable type constructors. */ +%token FUNC ITER CHAN + +/* Process types. */ +%token PROC PROG PORT + +/* Magic runtime services. */ +%token SPAWN LOG REFLECT EVAL + +/* Literals. */ +%token LIT_NUM +%token LIT_STR +%token LIT_CHAR + +/* Identifiers. */ +%token IDENT + +/* Precedences (mostly borrowed from C). */ +%left OR +%left AND +%left EQ NE +%left LT LE GE GT +%left LSL LSR ASR +%left PLUS MINUS +%left STAR SLASH PERCENT +%right NOT + +/* Entries. */ +%start program +%type program +%type expr +%type name +%type name_list +%type literal + +%% + +/* Rules */ + +program: FUNC { 1 (* placeholder start rule: accepts a single FUNC token and yields the constant 1 *) } + +expr: + expr OR expr { EXPR_binary (BINOP_or, $1, $3) } + | expr AND expr { EXPR_binary (BINOP_and, $1, $3) (* NOTE(review): EQ and NE have a precedence level declared above but no production in this rule, so BINOP_eq and BINOP_ne are never built here *) } + + | expr LT expr { EXPR_binary (BINOP_lt, $1, $3) } + | expr LE expr { EXPR_binary (BINOP_le, $1, $3) } + | expr GE expr { EXPR_binary (BINOP_ge, $1, $3) } + | expr GT expr { EXPR_binary (BINOP_gt, $1, $3) } + + | expr LSL expr { EXPR_binary (BINOP_lsl, $1, $3) } + | expr LSR expr { EXPR_binary (BINOP_lsr, $1, $3) } + | expr ASR expr { EXPR_binary (BINOP_asr, $1, $3) } + + | expr PLUS expr { EXPR_binary (BINOP_add, $1, $3) } + | expr MINUS expr { EXPR_binary (BINOP_sub, $1, $3) } + + | expr STAR expr { EXPR_binary (BINOP_mul, $1, $3) } + | expr SLASH expr { EXPR_binary (BINOP_div, $1, $3) } + | expr PERCENT expr { EXPR_binary (BINOP_mod, $1, $3) } + + | NOT expr { EXPR_unary (UNOP_not, $2) (* the only unary production; NOT is declared last in the precedence list, so it binds tightest *) } + + | literal { EXPR_literal $1 } + | name { EXPR_name $1 } + +literal: + LIT_STR { VAL_dyn (TY_str, VAL_str $1) (* NOTE(review): this uses the payload of LIT_STR, yet the token declarations above show no angle-bracket type annotation, which ocamlyacc requires for valued tokens; possibly lost when the patch was flattened, confirm against the original *) } + + | LIT_CHAR { VAL_dyn (TY_char, VAL_char $1) } + + | LIT_NUM { VAL_dyn (TY_arith (numty $1), (* numty, from the header, yields TY_rat for Num.Ratio values and TY_int otherwise *) + VAL_arith $1) } + +name: + name_list { Array.of_list $1 (* the dotted path is stored as an array of its identifier components *) } + +name_list: + IDENT 
DOT name_list { $1 :: $3 (* right-recursive: conses the leading identifier onto the rest of the dotted path *) } + | IDENT { [$1] } + + +top_defn: + + PUBLIC subr_defn { (* NOTE(review): this action is never closed, subr_defn is not defined anywhere in the grammar, and the lexer maps pub to PUB rather than PUBLIC; the rule looks unfinished *) + +%% + +(* Trailer *) diff --git a/test/foo.rs b/test/foo.rs new file mode 100644 index 00000000..b0ed7824 --- /dev/null +++ b/test/foo.rs @@ -0,0 +1,32 @@ +module foo; +syntax rx, html; + +use bar.baz.(a,b,c); + +type nat = int : positive(*); +type natvec1 = vec[nat]; +type natvec2 = vec[int : positive(*)]; + +pred bounded1(int a, int b, int c) = le(a,b), lt(b,c); +pred bounded2(int a, int b, int c) +{ + ret a <= b & b < c; +} + +const str x = "hello"; +const str y = replace(x, ~rx.pat{(el+)o}, ~rx.sub{$1foo}); +const str z = ~html.doc(4.0, "xhtml") +{ + hello +}; + +iter span(int a, int b) : le(a,b) + -> int : le(a,*), lt(*,b) +{ + int i = a; + while (i < b) + { + yield i; + i += 1; + } +}