(parser) Fix broken support for certain negative integer literals

This commit changes the logic to do the parsing of numeric literals at parse-time instead of scan-time, where we have better access to the actual context of how the literal is being used. This makes it possible to avoid making expressions like "-100" be `Expr.UnaryPrefix` and instead be a simple `Expr.Literal` with the value of the expected type, even for the edge cases described in #302. Fixes #302
perlang-org · Mar 21, 2022 · c98811d · c98811d
1 parent 11f48ce
commit c98811d
Show file tree

Hide file tree

Showing 9 changed files with 319 additions and 87 deletions.
diff --git a/release-notes/v0.2.0.md b/release-notes/v0.2.0.md
@@ -6,4 +6,8 @@
 ### Changed
 - This release does not contain any breaking changes.
 
+### Fixed
+- Fix broken support for certain negative integer literals. [[#302][302]]
+
 [300]: https://github.com/perlang-org/perlang/pull/300
+[302]: https://github.com/perlang-org/perlang/issues/302
diff --git a/src/Perlang.Common/NumericToken.cs b/src/Perlang.Common/NumericToken.cs
@@ -0,0 +1,26 @@
+using System.Globalization;
+
+namespace Perlang;
+
+public class NumericToken : Token
+{
+    public bool IsFractional { get; }
+    public Base NumberBase { get; }
+    public NumberStyles NumberStyles { get; }
+
+    public NumericToken(string lexeme, int line, string numberCharacters, bool isFractional, Base numberBase, NumberStyles numberStyles)
+        : base(TokenType.NUMBER, lexeme, numberCharacters, line)
+    {
+        IsFractional = isFractional;
+        NumberBase = numberBase;
+        NumberStyles = numberStyles;
+    }
+
+    public enum Base
+    {
+        BINARY = 2,
+        OCTAL = 8,
+        DECIMAL = 10,
+        HEXADECIMAL = 16,
+    }
+}
diff --git a/src/Perlang.Parser/NumberParser.cs b/src/Perlang.Parser/NumberParser.cs
@@ -0,0 +1,141 @@
+using System;
+using System.Globalization;
+using System.Numerics;
+using Perlang.Extensions;
+
+#nullable enable
+namespace Perlang.Parser;
+
+internal static class NumberParser
+{
+    public static object Parse(NumericToken numericToken)
+    {
+        string numberCharacters = (string)numericToken.Literal!;
+
+        if (numericToken.IsFractional)
+        {
+            // TODO: This is a mess. We currently treat all floating point values as _double_, which is insane. We
+            // TODO: should probably have a "use smallest possible type" logic as below for integers, for floating point
+            // TODO: values as well. We could also consider supporting `decimal` while we're at it.
+
+            // The explicit IFormatProvider is required to ensure we use 123.45 format, regardless of host OS
+            // language/region settings. See #263 for more details.
+            return Double.Parse(numberCharacters, CultureInfo.InvariantCulture);
+        }
+        else
+        {
+            // Any potential preceding '-' character has already been taken care of at this stage => we can treat
+            // the number as an unsigned value. However, we still try to coerce it to the smallest signed or
+            // unsigned integer type in which it will fit (but never smaller than 32-bit). This coincidentally
+            // follows the same semantics as how C# does it, for simplicity.
+
+            BigInteger value = numericToken.NumberBase switch
+            {
+                NumericToken.Base.DECIMAL =>
+                    BigInteger.Parse(numberCharacters, numericToken.NumberStyles),
+
+                NumericToken.Base.BINARY =>
+                    Convert.ToUInt64(numberCharacters, 2),
+
+                NumericToken.Base.OCTAL =>
+                    Convert.ToUInt64(numberCharacters, 8),
+
+                NumericToken.Base.HEXADECIMAL =>
+
+                    // Quoting from
+                    // https://docs.microsoft.com/en-us/dotnet/api/system.numerics.biginteger.parse?view=net-5.0#System_Numerics_BigInteger_Parse_System_ReadOnlySpan_System_Char__System_Globalization_NumberStyles_System_IFormatProvider_
+                    //
+                    // If value is a hexadecimal string, the Parse(String, NumberStyles) method interprets value as a
+                    // negative number stored by using two's complement representation if its first two hexadecimal
+                    // digits are greater than or equal to 0x80. In other words, the method interprets the highest-order
+                    // bit of the first byte in value as the sign bit. To make sure that a hexadecimal string is
+                    // correctly interpreted as a positive number, the first digit in value must have a value of zero.
+                    //
+                    // We presume that all hexadecimals should be treated as positive numbers for now.
+                    BigInteger.Parse('0' + numberCharacters, numericToken.NumberStyles),
+
+                _ =>
+                    throw new InvalidOperationException($"Base {(int)numericToken.NumberBase} not supported")
+            };
+
+            if (value <= Int32.MaxValue)
+            {
+                return (int)value;
+            }
+            else if (value <= UInt32.MaxValue)
+            {
+                return (uint)value;
+            }
+            else if (value <= Int64.MaxValue)
+            {
+                return (long)value;
+            }
+            else if (value <= UInt64.MaxValue)
+            {
+                return (ulong)value;
+            }
+            else // Anything else remains a BigInteger
+            {
+                return value;
+            }
+        }
+    }
+
+    public static object MakeNegative(object value)
+    {
+        if (value is double doubleValue)
+        {
+            return -doubleValue;
+        }
+        else if (value is int doubleInt)
+        {
+            return -doubleInt;
+        }
+        else if (value is uint doubleUint)
+        {
+            long negativeValue = -doubleUint;
+
+            // This is a special hack to ensure that the value -2147483648 gets returned as an `int` and not a `long`.
+            // Some details available in #302, summarized here in brief:
+            //
+            // The value 2147483648 is too large for an `int` => gets parsed into a `ulong` where it will fit. Once it
+            // has been made negative, the value -2147483648 is again small enough to fit in an `int` => the code below
+            // will narrow it down to comply with the "smallest type possible" design principle.
+            //
+            // Rationale: Two's complement: https://en.wikipedia.org/wiki/Two%27s_complement
+            if (negativeValue >= Int32.MinValue)
+            {
+                return (int)negativeValue;
+            }
+            else
+            {
+                return negativeValue;
+            }
+        }
+        else if (value is long longValue)
+        {
+            return -longValue;
+        }
+        else if (value is ulong ulongValue)
+        {
+            // Again, this needs to be handled specially to ensure that numbers that fit in a `long` doesn't use
+            // BigInteger unnecessarily.
+            BigInteger negativeValue = -new BigInteger(ulongValue);
+
+            if (negativeValue >= Int64.MinValue)
+            {
+                return (long)negativeValue;
+            }
+            else
+            {
+                // All negative numbers that are too big to fit in any of the smaller signed integer types will go
+                // through this code path.
+                return negativeValue;
+            }
+        }
+        else
+        {
+            throw new ArgumentException($"Type {value.GetType().ToTypeKeyword()} not supported");
+        }
+    }
+}
diff --git a/src/Perlang.Parser/PerlangParser.cs b/src/Perlang.Parser/PerlangParser.cs
@@ -564,7 +564,18 @@ private Expr UnaryPrefix()
             {
                 Token @operator = Previous();
                 Expr right = UnaryPrefix();
-                return new Expr.UnaryPrefix(@operator, right);
+
+                // We detect MINUS + NUMBER here and convert it to a single Expr.Literal() (with constant value) instead
+                // of retaining it as a unary prefix expression. See #302 for some more details about why this was
+                // changed.
+                if (@operator.Type == MINUS && right is Expr.Literal rightLiteral)
+                {
+                    return new Expr.Literal(NumberParser.MakeNegative(rightLiteral.Value!));
+                }
+                else
+                {
+                    return new Expr.UnaryPrefix(@operator, right);
+                }
             }
 
             return Call();
@@ -623,7 +634,15 @@ private Expr Primary()
             if (Match(TRUE)) return new Expr.Literal(true);
             if (Match(NULL)) return new Expr.Literal(null);
 
-            if (Match(NUMBER, STRING))
+            if (Match(NUMBER))
+            {
+                // Numbers are retained as strings in the scanning phase, to properly be able to parse negative numbers
+                // in the parsing stage (where we can more easily join the MINUS and NUMBER token together). See #302
+                // for details.
+                return new Expr.Literal(NumberParser.Parse((NumericToken)Previous()));
+            }
+
+            if (Match(STRING))
             {
                 return new Expr.Literal(Previous().Literal);
             }

diff --git a/src/Perlang.Parser/Scanner.cs b/src/Perlang.Parser/Scanner.cs
@@ -1,12 +1,12 @@
 // TODO: Remove once https://github.com/DotNetAnalyzers/StyleCopAnalyzers/issues/3392 has been resolved
+
 #pragma warning disable SA1515 // SingleLineCommentMustBePrecededByBlankLine
 
 using System;
 using System.Collections.Generic;
 using System.Collections.Immutable;
 using System.Globalization;
 using System.Linq;
-using System.Numerics;
 using System.Text;
 using static Perlang.TokenType;
 
@@ -350,7 +350,7 @@ private void ScanToken()
                 default:
                     // Even if a number is specified in a different base than 10 (e.g. binary, hexadecimal etc), it
                     // always starts with a "normal" (decimal) digit because of the prefix characters - e.g. 0x1234.
-                    if (IsDigit(c, Base.DECIMAL))
+                    if (IsDigit(c, NumericToken.Base.DECIMAL))
                     {
                         Number();
                     }
@@ -385,7 +385,7 @@ private void Number()
         {
             bool isFractional = false;
             var numberStyles = NumberStyles.Any;
-            var numberBase = Base.DECIMAL;
+            var numberBase = NumericToken.Base.DECIMAL;
             int startOffset = 0;
 
             char currentChar = Char.ToLower(Peek());
@@ -395,16 +395,16 @@ private void Number()
                 switch (currentChar)
                 {
                     case 'b':
-                        numberBase = Base.BINARY;
+                        numberBase = NumericToken.Base.BINARY;
                         break;
 
                     case 'o':
-                        numberBase = Base.OCTAL;
+                        numberBase = NumericToken.Base.OCTAL;
                         break;
 
                     case 'x':
                         numberStyles = NumberStyles.HexNumber;
-                        numberBase = Base.HEXADECIMAL;
+                        numberBase = NumericToken.Base.HEXADECIMAL;
                         break;
                 }
 
@@ -436,72 +436,11 @@ private void Number()
 
             string numberCharacters = RemoveUnderscores(source[(start + startOffset)..current]);
 
-            if (isFractional)
-            {
-                // TODO: This is a mess. We currently treat all floating point values as _double_, which is insane. We
-                // TODO: should probably have a "use smallest possible type" logic as below for integers, for floating point
-                // TODO: values as well. We could also consider supporting `decimal` while we're at it.
-
-                // The explicit IFormatProvider is required to ensure we use 123.45 format, regardless of host OS
-                // language/region settings. See #263 for more details.
-                AddToken(NUMBER, Double.Parse(numberCharacters, CultureInfo.InvariantCulture));
-            }
-            else
-            {
-                // Any potential preceding '-' character has already been taken care of at this stage => we can treat
-                // the number as an unsigned value. However, we still try to coerce it to the smallest signed or
-                // unsigned integer type in which it will fit (but never smaller than 32-bit). This coincidentally
-                // follows the same semantics as how C# does it, for simplicity.
-
-                BigInteger value = numberBase switch
-                {
-                    Base.DECIMAL =>
-                        BigInteger.Parse(numberCharacters, numberStyles),
-
-                    Base.BINARY =>
-                        Convert.ToUInt64(numberCharacters, 2),
-
-                    Base.OCTAL =>
-                        Convert.ToUInt64(numberCharacters, 8),
-
-                    Base.HEXADECIMAL =>
-                        // Quoting from
-                        // https://docs.microsoft.com/en-us/dotnet/api/system.numerics.biginteger.parse?view=net-5.0#System_Numerics_BigInteger_Parse_System_ReadOnlySpan_System_Char__System_Globalization_NumberStyles_System_IFormatProvider_
-                        //
-                        // If value is a hexadecimal string, the Parse(String, NumberStyles) method interprets value as a
-                        // negative number stored by using two's complement representation if its first two hexadecimal
-                        // digits are greater than or equal to 0x80. In other words, the method interprets the highest-order
-                        // bit of the first byte in value as the sign bit. To make sure that a hexadecimal string is
-                        // correctly interpreted as a positive number, the first digit in value must have a value of zero.
-                        //
-                        // We presume that all hexadecimals should be treated as positive numbers for now.
-                        BigInteger.Parse('0' + numberCharacters, numberStyles),
-
-                    _ =>
-                        throw new InvalidOperationException($"Base {(int)numberBase} not supported")
-                };
-
-                if (value < Int32.MaxValue)
-                {
-                    AddToken(NUMBER, (int)value);
-                }
-                else if (value < UInt32.MaxValue)
-                {
-                    AddToken(NUMBER, (uint)value);
-                }
-                else if (value < Int64.MaxValue)
-                {
-                    AddToken(NUMBER, (long)value);
-                }
-                else if (value < UInt64.MaxValue)
-                {
-                    AddToken(NUMBER, (ulong)value);
-                }
-                else // Anything else remains a BigInteger
-                {
-                    AddToken(NUMBER, value);
-                }
-            }
+            // Note that numbers are not parsed at this stage. We deliberately postpone it to the parsing stage, to be
+            // able to conjoin MINUS and NUMBER tokens together for negative numbers. The previous approach (inherited
+            // from Lox) worked poorly with our idea of "narrowing down" constants to smallest possible integer. See
+            // #302 for some more details.
+            AddToken(new NumericToken(source[start..current], line, numberCharacters, isFractional, numberBase, numberStyles));
         }
 
         private static string RemoveUnderscores(string s)
@@ -604,9 +543,9 @@ private static bool IsUnderscore(char c) =>
             c is '_';
 
         private static bool IsAlphaNumeric(char c) =>
-            IsAlpha(c) || IsUnderscore(c) || IsDigit(c, Base.DECIMAL);
+            IsAlpha(c) || IsUnderscore(c) || IsDigit(c, NumericToken.Base.DECIMAL);
 
-        private static bool IsDigit(char c, Base @base) =>
+        private static bool IsDigit(char c, NumericToken.Base @base) =>
             (int)@base switch
             {
                 2 => c is '0' or '1',
@@ -631,12 +570,9 @@ private void AddToken(TokenType type, object literal = null)
             tokens.Add(new Token(type, text, literal, line));
         }
 
-        private enum Base
+        private void AddToken(Token token)
         {
-            BINARY = 2,
-            OCTAL = 8,
-            DECIMAL = 10,
-            HEXADECIMAL = 16,
+            tokens.Add(token);
         }
     }
 }