Skip to content

Commit

Permalink
(parser) Fix broken support for certain negative integer literals
Browse files Browse the repository at this point in the history
This commit changes the logic to do the parsing of numeric literals at
parse-time instead of scan-time, where we have better access to the
actual context of how the literal is being used. This makes it possible
to avoid making expressions like "-100" be `Expr.UnaryPrefix` and
instead be a simple `Expr.Literal` with the value of the expected type,
even for the edge cases described in #302.

Fixes #302
  • Loading branch information
perlun committed Mar 20, 2022
1 parent 11f48ce commit 8e958f8
Show file tree
Hide file tree
Showing 8 changed files with 407 additions and 87 deletions.
26 changes: 26 additions & 0 deletions src/Perlang.Common/NumericToken.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
using System.Globalization;

namespace Perlang;

public class NumericToken : Token
{
public bool IsFractional { get; }
public Base NumberBase { get; }
public NumberStyles NumberStyles { get; }

public NumericToken(string lexeme, int line, string numberCharacters, bool isFractional, Base numberBase, NumberStyles numberStyles)
: base(TokenType.NUMBER, lexeme, numberCharacters, line)
{
IsFractional = isFractional;
NumberBase = numberBase;
NumberStyles = numberStyles;
}

public enum Base
{
BINARY = 2,
OCTAL = 8,
DECIMAL = 10,
HEXADECIMAL = 16,
}
}
141 changes: 141 additions & 0 deletions src/Perlang.Parser/NumberParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
using System;
using System.Globalization;
using System.Numerics;
using Perlang.Extensions;

#nullable enable
namespace Perlang.Parser;

internal static class NumberParser
{
public static object Parse(NumericToken numericToken)
{
string numberCharacters = (string)numericToken.Literal!;

if (numericToken.IsFractional)
{
// TODO: This is a mess. We currently treat all floating point values as _double_, which is insane. We
// TODO: should probably have a "use smallest possible type" logic as below for integers, for floating point
// TODO: values as well. We could also consider supporting `decimal` while we're at it.

// The explicit IFormatProvider is required to ensure we use 123.45 format, regardless of host OS
// language/region settings. See #263 for more details.
return Double.Parse(numberCharacters, CultureInfo.InvariantCulture);
}
else
{
// Any potential preceding '-' character has already been taken care of at this stage => we can treat
// the number as an unsigned value. However, we still try to coerce it to the smallest signed or
// unsigned integer type in which it will fit (but never smaller than 32-bit). This coincidentally
// follows the same semantics as how C# does it, for simplicity.

BigInteger value = numericToken.NumberBase switch
{
NumericToken.Base.DECIMAL =>
BigInteger.Parse(numberCharacters, numericToken.NumberStyles),

NumericToken.Base.BINARY =>
Convert.ToUInt64(numberCharacters, 2),

NumericToken.Base.OCTAL =>
Convert.ToUInt64(numberCharacters, 8),

NumericToken.Base.HEXADECIMAL =>

// Quoting from
// https://docs.microsoft.com/en-us/dotnet/api/system.numerics.biginteger.parse?view=net-5.0#System_Numerics_BigInteger_Parse_System_ReadOnlySpan_System_Char__System_Globalization_NumberStyles_System_IFormatProvider_
//
// If value is a hexadecimal string, the Parse(String, NumberStyles) method interprets value as a
// negative number stored by using two's complement representation if its first two hexadecimal
// digits are greater than or equal to 0x80. In other words, the method interprets the highest-order
// bit of the first byte in value as the sign bit. To make sure that a hexadecimal string is
// correctly interpreted as a positive number, the first digit in value must have a value of zero.
//
// We presume that all hexadecimals should be treated as positive numbers for now.
BigInteger.Parse('0' + numberCharacters, numericToken.NumberStyles),

_ =>
throw new InvalidOperationException($"Base {(int)numericToken.NumberBase} not supported")
};

if (value <= Int32.MaxValue)
{
return (int)value;
}
else if (value <= UInt32.MaxValue)
{
return (uint)value;
}
else if (value <= Int64.MaxValue)
{
return (long)value;
}
else if (value <= UInt64.MaxValue)
{
return (ulong)value;
}
else // Anything else remains a BigInteger
{
return value;
}
}
}

public static object MakeNegative(object value)
{
if (value is double doubleValue)
{
return -doubleValue;
}
else if (value is int doubleInt)
{
return -doubleInt;
}
else if (value is uint doubleUint)
{
long negativeValue = -doubleUint;

// This is a special hack to ensure that the value -2147483648 gets returned as an `int` and not a `long`.
// Some details available in #302, summarized here in brief:
//
// The value 2147483648 is too large for an `int` => gets parsed into a `ulong` where it will fit. Once it
// has been made negative, the value -2147483648 is again small enough to fit in an `int` => the code below
// will narrow it down to comply with the "smallest type possible" design principle.
//
// Rationale: Two's complement: https://en.wikipedia.org/wiki/Two%27s_complement
if (negativeValue >= Int32.MinValue)
{
return (int)negativeValue;
}
else
{
return negativeValue;
}
}
else if (value is long longValue)
{
return -longValue;
}
else if (value is ulong ulongValue)
{
// Again, this needs to be handled specially to ensure that numbers that fit in a `long` doesn't use
// BigInteger unnecessarily.
BigInteger negativeValue = -new BigInteger(ulongValue);

if (negativeValue >= Int64.MinValue)
{
return (long)negativeValue;
}
else
{
// All negative numbers that are too big to fit in any of the smaller signed integer types will go
// through this code path.
return negativeValue;
}
}
else
{
throw new ArgumentException($"Type {value.GetType().ToTypeKeyword()} not supported");
}
}
}
23 changes: 21 additions & 2 deletions src/Perlang.Parser/PerlangParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,18 @@ private Expr UnaryPrefix()
{
Token @operator = Previous();
Expr right = UnaryPrefix();
return new Expr.UnaryPrefix(@operator, right);

// We detect MINUS + NUMBER here and convert it to a single Expr.Literal() (with constant value) instead
// of retaining it as a unary prefix expression. See #302 for some more details about why this was
// changed.
if (@operator.Type == MINUS && right is Expr.Literal rightLiteral)
{
return new Expr.Literal(NumberParser.MakeNegative(rightLiteral.Value!));
}
else
{
return new Expr.UnaryPrefix(@operator, right);
}
}

return Call();
Expand Down Expand Up @@ -623,7 +634,15 @@ private Expr Primary()
if (Match(TRUE)) return new Expr.Literal(true);
if (Match(NULL)) return new Expr.Literal(null);

if (Match(NUMBER, STRING))
if (Match(NUMBER))
{
// Numbers are retained as strings in the scanning phase, to properly be able to parse negative numbers
// in the parsing stage (where we can more easily join the MINUS and NUMBER token together). See #302
// for details.
return new Expr.Literal(NumberParser.Parse((NumericToken)Previous()));
}

if (Match(STRING))
{
return new Expr.Literal(Previous().Literal);
}
Expand Down
94 changes: 15 additions & 79 deletions src/Perlang.Parser/Scanner.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
// TODO: Remove once https://github.com/DotNetAnalyzers/StyleCopAnalyzers/issues/3392 has been resolved

#pragma warning disable SA1515 // SingleLineCommentMustBePrecededByBlankLine

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Globalization;
using System.Linq;
using System.Numerics;
using System.Text;
using static Perlang.TokenType;

Expand Down Expand Up @@ -350,7 +350,7 @@ private void ScanToken()
default:
// Even if a number is specified in a different base than 10 (e.g. binary, hexadecimal etc), it
// always starts with a "normal" (decimal) digit because of the prefix characters - e.g. 0x1234.
if (IsDigit(c, Base.DECIMAL))
if (IsDigit(c, NumericToken.Base.DECIMAL))
{
Number();
}
Expand Down Expand Up @@ -385,7 +385,7 @@ private void Number()
{
bool isFractional = false;
var numberStyles = NumberStyles.Any;
var numberBase = Base.DECIMAL;
var numberBase = NumericToken.Base.DECIMAL;
int startOffset = 0;

char currentChar = Char.ToLower(Peek());
Expand All @@ -395,16 +395,16 @@ private void Number()
switch (currentChar)
{
case 'b':
numberBase = Base.BINARY;
numberBase = NumericToken.Base.BINARY;
break;

case 'o':
numberBase = Base.OCTAL;
numberBase = NumericToken.Base.OCTAL;
break;

case 'x':
numberStyles = NumberStyles.HexNumber;
numberBase = Base.HEXADECIMAL;
numberBase = NumericToken.Base.HEXADECIMAL;
break;
}

Expand Down Expand Up @@ -436,72 +436,11 @@ private void Number()

string numberCharacters = RemoveUnderscores(source[(start + startOffset)..current]);

if (isFractional)
{
// TODO: This is a mess. We currently treat all floating point values as _double_, which is insane. We
// TODO: should probably have a "use smallest possible type" logic as below for integers, for floating point
// TODO: values as well. We could also consider supporting `decimal` while we're at it.

// The explicit IFormatProvider is required to ensure we use 123.45 format, regardless of host OS
// language/region settings. See #263 for more details.
AddToken(NUMBER, Double.Parse(numberCharacters, CultureInfo.InvariantCulture));
}
else
{
// Any potential preceding '-' character has already been taken care of at this stage => we can treat
// the number as an unsigned value. However, we still try to coerce it to the smallest signed or
// unsigned integer type in which it will fit (but never smaller than 32-bit). This coincidentally
// follows the same semantics as how C# does it, for simplicity.

BigInteger value = numberBase switch
{
Base.DECIMAL =>
BigInteger.Parse(numberCharacters, numberStyles),

Base.BINARY =>
Convert.ToUInt64(numberCharacters, 2),

Base.OCTAL =>
Convert.ToUInt64(numberCharacters, 8),

Base.HEXADECIMAL =>
// Quoting from
// https://docs.microsoft.com/en-us/dotnet/api/system.numerics.biginteger.parse?view=net-5.0#System_Numerics_BigInteger_Parse_System_ReadOnlySpan_System_Char__System_Globalization_NumberStyles_System_IFormatProvider_
//
// If value is a hexadecimal string, the Parse(String, NumberStyles) method interprets value as a
// negative number stored by using two's complement representation if its first two hexadecimal
// digits are greater than or equal to 0x80. In other words, the method interprets the highest-order
// bit of the first byte in value as the sign bit. To make sure that a hexadecimal string is
// correctly interpreted as a positive number, the first digit in value must have a value of zero.
//
// We presume that all hexadecimals should be treated as positive numbers for now.
BigInteger.Parse('0' + numberCharacters, numberStyles),

_ =>
throw new InvalidOperationException($"Base {(int)numberBase} not supported")
};

if (value < Int32.MaxValue)
{
AddToken(NUMBER, (int)value);
}
else if (value < UInt32.MaxValue)
{
AddToken(NUMBER, (uint)value);
}
else if (value < Int64.MaxValue)
{
AddToken(NUMBER, (long)value);
}
else if (value < UInt64.MaxValue)
{
AddToken(NUMBER, (ulong)value);
}
else // Anything else remains a BigInteger
{
AddToken(NUMBER, value);
}
}
// Note that numbers are not parsed at this stage. We deliberately postpone it to the parsing stage, to be
// able to conjoin MINUS and NUMBER tokens together for negative numbers. The previous approach (inherited
// from Lox) worked poorly with our idea of "narrowing down" constants to smallest possible integer. See
// #302 for some more details.
AddToken(new NumericToken(source[start..current], line, numberCharacters, isFractional, numberBase, numberStyles));
}

private static string RemoveUnderscores(string s)
Expand Down Expand Up @@ -604,9 +543,9 @@ private static bool IsUnderscore(char c) =>
c is '_';

private static bool IsAlphaNumeric(char c) =>
IsAlpha(c) || IsUnderscore(c) || IsDigit(c, Base.DECIMAL);
IsAlpha(c) || IsUnderscore(c) || IsDigit(c, NumericToken.Base.DECIMAL);

private static bool IsDigit(char c, Base @base) =>
private static bool IsDigit(char c, NumericToken.Base @base) =>
(int)@base switch
{
2 => c is '0' or '1',
Expand All @@ -631,12 +570,9 @@ private void AddToken(TokenType type, object literal = null)
tokens.Add(new Token(type, text, literal, line));
}

private enum Base
private void AddToken(Token token)
{
BINARY = 2,
OCTAL = 8,
DECIMAL = 10,
HEXADECIMAL = 16,
tokens.Add(token);
}
}
}
Loading

0 comments on commit 8e958f8

Please sign in to comment.