decouple parser from PNode #425

Merged · 1 commit · Sep 4, 2022
44 changes: 43 additions & 1 deletion compiler/ast/ast.nim
@@ -16,20 +16,23 @@ import
ast_types, # Main ast type definitions
ast_idgen, # Per module Id generation
ast_query, # querying/reading the ast
ast_parsed_types, # Data types for the parsed node
lexer, # NumericalBase
],
compiler/front/[
options
],
compiler/utils/[
ropes,
astrepr,
int128 # Values for integer nodes
],
std/[
strutils,
tables # For symbol table mapping
]

export ast_types, ast_idgen, ast_query, int128
export ast_types, ast_idgen, ast_query, int128, ast_parsed_types

var ggDebug* {.deprecated.}: bool ## convenience switch for trying out things

@@ -604,3 +607,42 @@ proc toHumanStr*(kind: TSymKind): string =
proc toHumanStr*(kind: TTypeKind): string =
## strips leading `tk`
result = toHumanStrImpl(kind, 2)


proc setBaseFlags(n: PNode, base: NumericalBase) =
case base
of base10: discard
of base2: incl(n.flags, nfBase2)
of base8: incl(n.flags, nfBase8)
of base16: incl(n.flags, nfBase16)


proc toPNode*(parsed: ParsedNode): PNode =
result = newNodeI(parsed.kind, parsed.info)
result.comment = parsed.comment
case parsed.kind:
of nkFloatKinds:
result.floatVal = parsed.token.fNumber
result.setBaseFlags(parsed.token.base)

of nkIntKinds - { nkCharLit }:
result.intVal = parsed.token.iNumber
result.setBaseFlags(parsed.token.base)

of nkCharLit:
result.intVal = ord(parsed.token.literal[0])

of nkStrKinds:
result.strVal = parsed.token.literal

of nkIdent:
result.ident = parsed.token.ident

else:
if parsed.isBlockArg:
result.flags.incl nfBlockArg


for sub in items(parsed):
result.add sub.toPNode()
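
As a rough usage sketch (not part of the PR): the parser is the intended producer of ParsedNode values, but a hand-built tree can be converted the same way. This assumes Token is a plain object with a settable literal field and that unknownLineInfo and the ParsedNode procs from this change are reachable via the listed imports; treat it as an illustration rather than working compiler code.

import compiler/ast/[ast, lexer]

# Hypothetical, hand-built input; real ParsedNode trees come from the parser.
var tok: Token
tok.literal = "a"                     # payload for a character literal

let stmts = newParsedNode(nkStmtList, unknownLineInfo)
stmts.add newParsedNode(nkCharLit, unknownLineInfo, tok)

let converted: PNode = stmts.toPNode()
assert converted[0].intVal == ord('a')   # char literal carried over as intVal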

111 changes: 111 additions & 0 deletions compiler/ast/ast_parsed_types.nim
@@ -0,0 +1,111 @@
## Data structure for the parser results

import
compiler/ast/[
ast_types, # For the node kinds
lexer # For the token type definition
]

# NOTE further refactoring considerations for the parser
#
# - store everything in tokens, do not require identifier interning for any
# purposes during the parsing stage, it must be done later, during
# conversion to a PNode. This will simplify some parts of the type
# definition.
# - remove nim"pretty" - this is an absolute joke of implementation and
# it should not be placed where it is now.

type
ParsedNode* = ref object
# NOTE next two fields are very large combined, but further plans will
# deal with that problem - current implementation is easier to write
# and it is just a transition point.
info*: TLineInfo # TODO replace line and separate token with index to
# the token, which in turn will store information
# about global positioning (tuple made up of a token
# id and a file ID)
#
# NOTE technically this is not really necessary even
# with the current implementation, but the parser
# consistently copies this information around anyway,
# so I will let it stay this way for now.
token*: Token # TODO Replace full token value with an index information
kind*: TNodeKind # NOTE/QUESTION - for now the same kind of nodes is
# reused as the main parser, to ease the transition,
# but in the future two different sets of node kinds
# might(?) be introduced.

Collaborator: Yes, or at least one will be a super set of the other. Ultimately separate enums are likely best.

# TODO replace `ref` object tree with begin/end ranges for the nested
# trees in the linearized structure.
sons*: seq[ParsedNode]
comment*: string # TODO this should either be a token or a sequence of
# tokens.

# HACK explicit flags in order to track down all 'extra' information
# that is collected during parsing.
isBlockArg*: bool # QUESTION add 'nkStmtListBlockArg' or similar node
# and convert it to the `nkStmtList` + `nfBlockArg`
# flags later on? Why do we need the `nfBlockArg`
# flag in the first place?

@saem (Collaborator), Sep 1, 2022:

You found it too!

OK, so this dumb thing is used to tell the semantic analysis layer about a statement list (possibly also block) that is being passed to a macro, template, whatever. During semantic analysis we collapse/flatten nested statement lists into one level, but we also do statement list unwrapping, so if there is only one node in a statement list we unwrap it, except if there is a defer (I think this might be a bug/design flaw).

That first flattening is guarded by this dumb flag. I personally think we should always flatten, no matter what, but never unwrap in semStmtList (it's the last/second-last proc in semstmts; I rewrote it recently so it should be readable). Without that guard a macro can receive arbitrary chunks of AST not always wrapped in a statement list, and that makes them really annoying/painful to write.

I do believe that between fixing the unpacking logic and the AST the parser generates, this shouldn't be a problem. My guess at the most correct fix is that the semantic analysis layer should:

  • unwrap as determined by the receiving node, i.e. done on add/index assign based on the receiving node kind
  • flatten always during semantic analysis of a statement list, because that's just always right

Then this dumb guard should not be necessary. Separating the AST used by sem, parsing, and the backend means we can change add and index-assign behavior so that it only impacts sem, and we can do destination-based unwrapping rather than have it live incorrectly in semStmtList. 🎉

Sorry that was a rather lengthy explanation.
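A minimal sketch of the flatten/unwrap distinction described in the comment above, using a toy node type invented for illustration; this is not the compiler's actual semStmtList code, only an assumption-laden outline of the two behaviours (always flatten nested statement lists; unwrap a single-element list unless its only element is a defer).

type
  ToyKind = enum tnkStmtList, tnkDefer, tnkOther
  ToyNode = ref object
    kind: ToyKind
    sons: seq[ToyNode]

proc flatten(n: ToyNode): ToyNode =
  ## Always collapse nested statement lists into a single level.
  if n.kind != tnkStmtList:
    return n
  result = ToyNode(kind: tnkStmtList)
  for son in n.sons:
    let f = flatten(son)
    if f.kind == tnkStmtList:
      result.sons.add f.sons   # splice the nested list's children
    else:
      result.sons.add f

proc unwrap(n: ToyNode): ToyNode =
  ## Replace a single-element statement list by its only child, except
  ## when that child is a `defer` (the exception noted above).
  if n.kind == tnkStmtList and n.sons.len == 1 and n.sons[0].kind != tnkDefer:
    n.sons[0]
  else:
    n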

func len*(node: ParsedNode): int =
## Number of the sons of a parsed node
return node.sons.len()

# NOTE added for the sake of API similarity with PNode
proc safeLen*(node: ParsedNode): int = node.len()

proc `[]`*(node: ParsedNode, idx: int | BackwardsIndex): ParsedNode =
return node.sons[idx]

proc `[]=`*(node: ParsedNode, idx: int | BackwardsIndex, other: ParsedNode) =
node.sons[idx] = other

iterator items*(node: ParsedNode): ParsedNode =
for item in node.sons.items():
yield item

iterator pairs*(node: ParsedNode): (int, ParsedNode) =
for idx, item in pairs(node.sons):
yield (idx, item)

proc add*(node: ParsedNode, other: ParsedNode) =
## Add new element to the sons
node.sons.add(other)

proc transitionSonsKind*(n: ParsedNode, kind: TNodeKind) =
n.kind = kind

proc transitionIntKind*(n: ParsedNode, kind: TNodeKind) =
n.kind = kind

proc transitionNoneToSym*(n: ParsedNode) =
n.kind = nkSym

func newParsedNode*(kind: TNodeKind): ParsedNode =
## Create a new parsed node without any location or token information
return ParsedNode(kind: kind, info: unknownLineInfo)

func newParsedNode*(
kind: TNodeKind, info: TLineInfo, sons: seq[ParsedNode] = @[]): ParsedNode =
## Create a new non-leaf parsed node with a specified location
## information and sons.
return ParsedNode(kind: kind, info: info, sons: sons)

func newParsedNode*(kind: TNodeKind, info: TLineInfo, token: Token): ParsedNode =
## Create a new leaf parsed node with the specified location information
## and token kind.
return ParsedNode(kind: kind, info: info, token: token)


proc newProcNode*(
kind: TNodeKind,
info: TLineInfo,
body, params, name, pattern, genericParams,
pragmas, exceptions: ParsedNode
): ParsedNode =

result = newParsedNode(
kind,
info,
@[name, pattern, genericParams, params, pragmas, exceptions, body])
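
The fixed son order passed by newProcNode above (name, pattern, genericParams, params, pragmas, exceptions, body) is what makes positional, role-named indexing attractive; a hypothetical sketch of such an accessor follows. RoutinePos and this `[]` overload are invented here, not part of the PR, and only echo the NodePosName-style indexing discussed further below.

type
  RoutinePos = enum    # invented names; mirrors the son order above
    posName, posPattern, posGenericParams, posParams,
    posPragmas, posExceptions, posBody

proc `[]`(node: ParsedNode, pos: RoutinePos): ParsedNode =
  ## Index a routine node by the role of the son instead of a bare integer.
  node.sons[pos.ord]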
2 changes: 2 additions & 0 deletions compiler/ast/ast_query.nim
@@ -97,6 +97,8 @@ const
callableDefs* = nkLambdaKinds + routineDefs

nkSymChoices* = {nkClosedSymChoice, nkOpenSymChoice}
nkFloatKinds* = nkFloatLiterals # QUESTION remove float literals
# altogether?
nkStrKinds* = {nkStrLit..nkTripleStrLit}
nkIntKinds* = {nkCharLit .. nkUInt64Lit}

Collaborator: I don't think so, but maybe I'm missing some key point? Whatchya thinking?

@haxscramper (Collaborator, Author), Sep 1, 2022:

We have node kinds for everything and only float literals. It is a naming question, and it is a set of node kinds, not a set of literals. Also nkLiterals is inconsistent with the atomic kinds in macros.nim, but that's another question. I think nkLiteralKinds, nkFloatKinds or even nkFloatLiteralKinds makes more sense (the last one is the most sensible IMO, but a bit longer).

TBH I always found these enum set definitions a bit weirdly structured and lacking in some places, but it is hard to see the underlying structure immediately from the code; it must be an incremental improvement.

@haxscramper (Collaborator, Author):

> proc `[]`*(node: PNode, pos: NodePosName): PNode =

I think those two must be intrinsically related to each other (the collection of the enum kind sets and the `[]` named indexing operator), but for now that's just a new idea I got while elaborating on the "literals/kinds" distinction above.

Collaborator:

For nkFloatLiterals vs nkFloatKinds, the former is literals only and the latter might include more, but I can't think of a single thing not covered by the former. There is a difference in meaning, but I'm really hard pressed to think of a case where they would not be equivalent. I would check the sites and consolidate, personally.

> proc `[]`*(node: PNode, pos: NodePosName): PNode =
>
> I think those two must be intrinsically related to each other (the collection of the enum kind sets and the `[]` named indexing operator), but for now that's just a new idea I got while elaborating on the "literals/kinds" distinction above

That's exactly the type of insight that forms by looking at this stuff.

I agree with the sentiment; the naming and categorization needs to make sense based on purpose and not solely because a new programmer's first instincts are to lump them together.

So to recap:

  • naming needs to be more consistent
  • naming needs to better convey intention
  • sets need to serve a purpose(s), e.g. many sites in the compiler need to handle all literals one way (nkLiterals), or lots of literal handling is weird about floats vs other numbers (nkFloatLiterals)

Doesn't have to be those precise names, but I think we're on the same page. For actually changing sets, carefully looking through their usage is required because there are unfortunate subtleties.

4 changes: 4 additions & 0 deletions compiler/ast/ast_types.nim
@@ -20,6 +20,10 @@ type
offsetA*, offsetB*: int
commentOffsetA*, commentOffsetB*: int

const
InvalidFileIdx* = FileIndex(-1)
unknownLineInfo* = TLineInfo(line: 0, col: -1, fileIndex: InvalidFileIdx)

type
TCallingConvention* = enum
ccNimCall = "nimcall" ## nimcall, also the default
1 change: 1 addition & 0 deletions compiler/ast/lexer.nim
@@ -517,6 +517,7 @@ proc getNumber(L: var Lexer, result: var Token) =
# XXX: Test this on big endian machine!
of tkFloat64Lit, tkFloatLit:
setNumber result.fNumber, (cast[PFloat64](addr(xi)))[]

else:
L.config.internalError(getLineInfo(L), rintIce, "getNumber")

11 changes: 1 addition & 10 deletions compiler/ast/lineinfos.nim
@@ -78,12 +78,7 @@ proc computeNotesVerbosity(): tuple[
}

when defined(nimDebugUtils):
result.base.incl {
rdbgTraceStart, # Begin report
rdbgTraceStep, # in/out
rdbgTraceLine,
rdbgTraceEnd # End report
}
result.base.incl repDebugTraceKinds

result.main[compVerbosityMax] = result.base + repWarningKinds + repHintKinds - {
rsemObservableStores,
@@ -201,10 +196,6 @@ proc hash*(i: TLineInfo): Hash =
proc raiseRecoverableError*(msg: string) {.noinline.} =
raise newException(ERecoverableError, msg)

const
InvalidFileIdx* = FileIndex(-1)
unknownLineInfo* = TLineInfo(line: 0, col: -1, fileIndex: InvalidFileIdx)

func isKnown*(info: TLineInfo): bool =
## Check if `info` represents valid source file location
info != unknownLineInfo