Implemented SNBT parser

This commit is contained in:
ForeverZer0 2021-08-31 22:16:27 -04:00
parent 31aac628b6
commit 92112ac887
8 changed files with 433 additions and 177 deletions

View File

@ -0,0 +1,41 @@
{
"test case": 90,
noQuotes: "HELLO WORLD THIS IS A TEST STRING ÅÄÖ!",
"test with \" escaped quote": 90,
'single quoted with inner "double quotes"': 90,
"double quoted with inner 'single quote' in text": -45.0f,
shortTest: 32767s,
longTest: 9223372036854775807L,
byteTest: 127b,
byteArrayTest: [B; 0B, 62B, 34B, 16B, 8B, 10B, 22B, 44B, 76B, 18B, 70B, 32B, 4B, 86B, 78B, 80B, 92B, 14B, 46B, 48B],
"listTest (long)": [
-11L,
12L,
13L,
-14L,
15L
],
floatTest: 0.49823147f,
doubleTest: 0.4931287132182315d,
intTest: 2147483647,
"listTest (compound)": [
{
created-on: 1264099775885L,
name: "Compound tag #0"
},
{
created-on: 1264099775885L,
name: "Compound tag #1"
}
],
"nested compound test": {
egg: {
name: "Eggbert",
value: 0.5f
},
ham: {
name: "Hampus",
value: 0.75f
}
}
}

View File

@ -32,6 +32,8 @@
<EmbeddedResource Include="Data\hello_world.nbt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource>
<None Remove="Data\bigtest.snbt" />
<EmbeddedResource Include="Data\bigtest.snbt" />
</ItemGroup>
</Project>

View File

@ -1,4 +1,6 @@
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.VisualStudio.TestPlatform.ObjectModel;
using SharpNBT.SNBT;
using Xunit;
@ -34,22 +36,19 @@ namespace SharpNBT.Tests
public void ParseSmall()
{
const string testString = "{name1:123,name2:\"sometext1\",name3:{subname1:456,subname2:\"sometext2\"}}";
var lexer = new Lexer();
foreach (var token in lexer.Tokenize(testString))
{
output.WriteLine($"{token.Type}: \"{token.Match.Trim()}\"");
}
var tag = StringNbt.Parse(testString);
output.WriteLine(tag.PrettyPrinted());
}
[Fact]
public void ParseBig()
{
var testString = File.ReadAllText("/code/ruby/craftbook-nbt/test/bigtest.snbt");
var lexer = new Lexer();
foreach (var token in lexer.Tokenize(testString))
{
output.WriteLine($"{token.Type}: \"{token.Match.Trim()}\"");
}
using var stream = TestHelper.GetFile("bigtest.snbt", CompressionType.None);
using var reader = new StreamReader(stream, Encoding.UTF8);
var testString = reader.ReadToEnd();
var tag = StringNbt.Parse(testString);
output.WriteLine(tag.PrettyPrinted());
}
}
}

View File

@ -1,180 +1,50 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection.Emit;
using System.Security;
using System.Text.RegularExpressions;
using JetBrains.Annotations;
using System.Data;
namespace SharpNBT.SNBT
{
internal enum TokenType
internal sealed class Lexer
{
None,
CompoundBegin,
CompoundEnd,
Identifier,
String,
Separator,
Comma,
ByteArray,
IntArray,
LongArray,
ListArray,
EndArray,
Float,
Double,
Byte,
Short,
Long,
Int,
WhiteSpace,
Char,
EscapedChar
}
internal sealed class LexerRule
{
internal delegate string PostProcessHandler(Match match);
private readonly List<LexerRule> ruleList;
public Regex Matcher { get; }
public TokenType Type { get; }
public string Description { get; }
public string PostProcess(Match match) => handler?.Invoke(match) ?? match.Value;
private readonly PostProcessHandler handler;
public LexerRule(TokenType type, string description, string pattern, [CanBeNull] PostProcessHandler process)
{
Type = type;
Description = description;
Matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.CultureInvariant);
handler = process;
}
// public LexerRule(TokenType type, string description, Regex regex)
// {
// Description = description;
// Type = type;
//
// }
}
internal sealed class Token
{
public TokenType Type { get; }
public string Match { get; }
public Token(TokenType type, string match)
{
Type = type;
Type = type;
Match = match;
}
}
internal class Lexer
{
private static readonly string DoubleQuoteIdentifier = "\"(.*?)\"\\s*(?=:)";
private static readonly List<LexerRule> rules;
private const string IDENTIFIER_DOUBLE_QUOTES = "\".*?\"\\s*(?>:)";
private const string IDENTIFIER_SINGLE_QUOTES = "'.*?'\\s*(?>:)";
private const string IDENTIFIER_NO_QUOTES = @"[A-Za-z0-9_-]+\s*(?=:)";
private const string STRING_DOUBLE_QUOTED = "^\\s*\".*?\"";
private const string STRING_SINGLE_QUOTED = "^\\s*'.*?'";
private const string COMPOUND_START = "\\s*{\\s*";
private const string COMPOUND_END = @"\}";
private const string SEPARATOR = "^\\s*:\\s*";
private const string COMMA = "^\\s*,\\s*";
static Lexer()
{
rules = new List<LexerRule>
{
new LexerRule(TokenType.CompoundBegin, "Opening Compound brace", "^{", null),
new LexerRule(TokenType.WhiteSpace, "Useless whitespace", @"^[\s]+", null),
new LexerRule(TokenType.Identifier, "Single-quoted name", "^\\s*'(.*?)'\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.Identifier, "Double-quoted name", "^\\s*\"(.*?)\"\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.Identifier, "Unquoted name", "^\\s*([A-Za-z0-9_-]+)\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.String, "Double-quoted string value", "^\"(.*?)\"", null),
new LexerRule(TokenType.String, "Single-quoted string value", "^'(.*?)'", null)
// new LexerRule(TokenType.CompoundBegin, COMPOUND_START),
// new LexerRule(TokenType.CompoundEnd, COMPOUND_END),
// new LexerRule(TokenType.Identifier, IDENTIFIER_DOUBLE_QUOTES),
// new LexerRule(TokenType.Identifier, IDENTIFIER_SINGLE_QUOTES),
// new LexerRule(TokenType.Identifier, IDENTIFIER_NO_QUOTES),
// new LexerRule(TokenType.String, STRING_DOUBLE_QUOTED),
// new LexerRule(TokenType.String, STRING_SINGLE_QUOTED),
// new LexerRule(TokenType.Separator, SEPARATOR),
// new LexerRule(TokenType.Comma, COMMA),
// new LexerRule(TokenType.ByteArray, @"\[B;[\s]*?"),
// new LexerRule(TokenType.IntArray, @"\[I;[\s]*?"),
// new LexerRule(TokenType.LongArray, @"\[L;[\s]*?"),
// new LexerRule(TokenType.ListArray, @"\[[\s]*?"),
// new LexerRule(TokenType.EndArray, @"[\s]*\]"),
// new LexerRule(TokenType.Float, @"-?[0-9]*\.[0-9]+[Ff]"),
// new LexerRule(TokenType.Double, @"-?[0-9]*\.[0-9]+[Dd]?"),
// new LexerRule(TokenType.Byte, "-?([0-9]+)[Bb]"),
// new LexerRule(TokenType.Short, "-?([0-9]+)[Ss]"),
// new LexerRule(TokenType.Long, "-?([0-9]+)[Ll]"),
// new LexerRule(TokenType.Int, "-?([0-9]+)"),
// new LexerRule(TokenType.WhiteSpace, @"[\s]+"),
// new LexerRule(TokenType.String, @"[\S]+"),
// new LexerRule(TokenType.Char, ".")
};
}
private static string Process(Match match)
{
throw new NotImplementedException();
}
public Lexer()
{
ruleList = new List<LexerRule>();
}
public IEnumerable<Token> Tokenize(string input)
{
string.Create(input.Length, input, (span, i) =>
{
});
var pos = 0;
public void AddRule(TokenType type, string pattern, bool skipped = false) => ruleList.Add(new LexerRule(type, pattern, null, skipped));
do
{
Label:
foreach (var rule in rules)
{
var match = rule.Matcher.Match(input, pos);
if (match.Success)
{
yield return new Token(rule.Type, rule.PostProcess(match));
pos = match.Index + match.Length - 1;
break;
}
}
} while (++pos < input.Length);
/// <summary>
/// Registers a tokenization rule with an optional post-processing handler.
/// </summary>
/// <param name="type">The type of token the rule emits.</param>
/// <param name="pattern">The regular expression pattern used to match the rule.</param>
/// <param name="handler">A callback that transforms the matched text, or <see langword="null"/> to use the raw match.</param>
/// <param name="skipped">When <see langword="true"/>, matches are consumed but never emitted as tokens.</param>
public void AddRule(TokenType type, string pattern, ResultHandler handler, bool skipped = false) =>
    ruleList.Add(new LexerRule(type, pattern, handler, skipped));
/// <summary>
/// Splits the given <paramref name="source"/> text into a lazily-evaluated stream of tokens.
/// </summary>
/// <param name="source">The text to tokenize.</param>
/// <returns>A sequence of tokens; rules flagged as skipped are consumed silently.</returns>
/// <exception cref="SyntaxErrorException">No registered rule matches at the current position.</exception>
public IEnumerable<Token> Tokenize(string source)
{
    var position = 0;
    while (position < source.Length)
    {
        LexerRule matchedRule = null;
        Match match = null;

        // Rules are tried in registration order; the first pattern that
        // matches exactly at the current position wins.
        foreach (var candidate in ruleList)
        {
            var attempt = candidate.Pattern.Match(source, position);
            if (!attempt.Success || attempt.Index != position)
                continue;
            matchedRule = candidate;
            match = attempt;
            break;
        }

        if (matchedRule is null)
            throw new SyntaxErrorException($"Unrecognized sequence at index {position}: '{source[position]}'");

        if (!matchedRule.IsSkipped)
            yield return new Token(matchedRule.Type, matchedRule.Process(source, position, match));
        position += match.Length;
    }
}
}
}

View File

@ -0,0 +1,37 @@
using System;
using System.Text.RegularExpressions;
namespace SharpNBT.SNBT
{
internal delegate string ResultHandler(Match match);
internal class LexerRule
{
    // Optional transform applied to the text of a successful match
    // (e.g. extracting a capture group instead of the whole match).
    private readonly ResultHandler processResult;

    /// <summary>
    /// Gets the type of token this rule produces.
    /// </summary>
    public TokenType Type { get; }

    /// <summary>
    /// Gets the compiled regular expression used to match this rule.
    /// </summary>
    public Regex Pattern { get; }

    /// <summary>
    /// Gets a flag indicating whether matches of this rule are omitted from the token stream.
    /// </summary>
    public bool IsSkipped { get; }

    /// <summary>
    /// Creates a rule whose token value is the raw matched text.
    /// </summary>
    public LexerRule(TokenType type, string pattern, bool skipped = false) : this(type, pattern, null, skipped)
    {
    }

    /// <summary>
    /// Creates a rule with an optional handler that post-processes the matched text.
    /// </summary>
    /// <param name="type">The type of token this rule produces.</param>
    /// <param name="pattern">The regular expression pattern to match.</param>
    /// <param name="handler">A callback to transform the matched text, or <see langword="null"/> to use it verbatim.</param>
    /// <param name="skipped">When <see langword="true"/>, matches are consumed but not emitted as tokens.</param>
    public LexerRule(TokenType type, string pattern, ResultHandler handler, bool skipped = false)
    {
        Type = type;
        // Rules are constructed once and reused for every tokenization pass,
        // so the up-front cost of compiling the pattern pays off quickly.
        Pattern = new Regex(pattern, RegexOptions.Compiled | RegexOptions.CultureInvariant);
        IsSkipped = skipped;
        processResult = handler;
    }

    /// <summary>
    /// Produces the token value for a successful match, applying the post-process handler when present.
    /// </summary>
    public string Process(string source, int index, Match match)
    {
        return processResult is null ? source.Substring(index, match.Length) : processResult.Invoke(match);
    }
}
}

174
SharpNBT/SNBT/StringNbt.cs Normal file
View File

@ -0,0 +1,174 @@
using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// Provides parsing of string-NBT (SNBT) formatted text into strongly-typed tags.
/// </summary>
public static class StringNbt
{
    // Shared tokenizer. Rule order matters: the first rule that matches at the
    // current position wins, so more specific patterns (e.g. "[B;") must be
    // registered before more general ones (e.g. "[").
    private static readonly Lexer lexer;

    static StringNbt()
    {
        lexer = new Lexer();
        lexer.AddRule(TokenType.Whitespace, @"(\r|\t|\v|\f|\s)+?", true);
        lexer.AddRule(TokenType.Separator, ",", true);
        lexer.AddRule(TokenType.Compound, @"{");
        lexer.AddRule(TokenType.EndCompound, @"}");
        // Quoted names/values may contain backslash-escaped characters (e.g. \" or \\),
        // so match "an escaped character OR any character that is not a quote/backslash".
        lexer.AddRule(TokenType.Identifier, @"""((?:\\.|[^""\\])*)""\s*(?>:)", QuotedValue);
        lexer.AddRule(TokenType.Identifier, @"'((?:\\.|[^'\\])*)'\s*(?>:)", QuotedValue);
        lexer.AddRule(TokenType.Identifier, "([A-Za-z0-9_-]+)\\s*(?>:)", FirstGroupValue);
        lexer.AddRule(TokenType.String, @"""((?:\\.|[^""\\])*)""", QuotedValue);
        lexer.AddRule(TokenType.String, @"'((?:\\.|[^'\\])*)'", QuotedValue);
        lexer.AddRule(TokenType.ByteArray, @"\[B;");
        lexer.AddRule(TokenType.IntArray, @"\[I;");
        lexer.AddRule(TokenType.LongArray, @"\[L;");
        lexer.AddRule(TokenType.List, @"\[");
        lexer.AddRule(TokenType.EndArray, @"\]");
        lexer.AddRule(TokenType.Float, @"(-?[0-9]*\.[0-9]+)[Ff]", FirstGroupValue);
        lexer.AddRule(TokenType.Double, @"(-?[0-9]*\.[0-9]+)[Dd]?", FirstGroupValue);
        lexer.AddRule(TokenType.Byte, "(-?[0-9]+)[Bb]", FirstGroupValue);
        lexer.AddRule(TokenType.Short, "(-?[0-9]+)[Ss]", FirstGroupValue);
        lexer.AddRule(TokenType.Long, "(-?[0-9]+)[Ll]", FirstGroupValue);
        lexer.AddRule(TokenType.Int, "(-?[0-9]+)", FirstGroupValue);
    }

    // Returns the first capture group of a match verbatim.
    private static string FirstGroupValue(Match match) => match.Groups[1].Value;

    // Returns the first capture group with backslash escape sequences resolved.
    private static string QuotedValue(Match match) => Unescape(match.Groups[1].Value);

    /// <summary>
    /// Resolves backslash escape sequences (e.g. \" and \\) within quoted text.
    /// </summary>
    private static string Unescape(string value)
    {
        if (value.IndexOf('\\') < 0)
            return value;

        var buffer = new StringBuilder(value.Length);
        for (var i = 0; i < value.Length; i++)
        {
            var c = value[i];
            // A backslash escapes the character that follows it.
            if (c == '\\' && i + 1 < value.Length)
                c = value[++i];
            buffer.Append(c);
        }
        return buffer.ToString();
    }

    /// <summary>
    /// Parses the specified SNBT <paramref name="source"/> string into a <see cref="CompoundTag"/>.
    /// </summary>
    /// <param name="source">The SNBT text to parse.</param>
    /// <returns>The parsed top-level compound; empty/whitespace input yields an empty unnamed compound.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <see langword="null"/>.</exception>
    /// <exception cref="SyntaxErrorException">The input is not well-formed SNBT.</exception>
    public static CompoundTag Parse([NotNull] string source)
    {
        if (source is null)
            throw new ArgumentNullException(nameof(source));
        if (string.IsNullOrWhiteSpace(source))
            return new CompoundTag(null);

        var queue = new Queue<Token>(lexer.Tokenize(source));
        return Parse<CompoundTag>(queue);
    }

    // Convenience wrapper that casts the parsed tag to the expected type.
    private static T Parse<T>(Queue<Token> queue) where T : Tag => (T)Parse(queue);

    /// <summary>
    /// Parses the next complete tag (optionally preceded by an identifier naming it) from the token queue.
    /// </summary>
    private static Tag Parse(Queue<Token> queue)
    {
        string name = null;
        var token = MoveNext(queue);
        if (token.Type == TokenType.Identifier)
        {
            name = token.Value;
            token = MoveNext(queue);
        }

        // Numeric values are parsed with the invariant culture: SNBT always
        // uses '.' as the decimal separator regardless of the host locale.
        return token.Type switch
        {
            TokenType.Compound => ParseCompound(name, queue),
            TokenType.String => new StringTag(name, token.Value),
            TokenType.ByteArray => ParseByteArray(name, queue),
            TokenType.IntArray => ParseIntArray(name, queue),
            TokenType.LongArray => ParseLongArray(name, queue),
            TokenType.List => ParseList(name, queue),
            TokenType.Byte => new ByteTag(name, sbyte.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Short => new ShortTag(name, short.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Int => new IntTag(name, int.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Long => new LongTag(name, long.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Float => new FloatTag(name, float.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Double => new DoubleTag(name, double.Parse(token.Value, CultureInfo.InvariantCulture)),
            _ => throw new SyntaxErrorException()
        };
    }

    /// <summary>
    /// Dequeues the next token, failing loudly on premature end-of-input.
    /// </summary>
    [NotNull]
    private static Token MoveNext(Queue<Token> queue)
    {
        if (queue.TryDequeue(out var token))
            return token;
        throw new SyntaxErrorException("Unexpected end-of-input");
    }

    /// <summary>
    /// Dequeues the next token and asserts it is of the specified type.
    /// </summary>
    private static void MoveNext(Queue<Token> queue, TokenType assertType)
    {
        var token = MoveNext(queue);
        if (token.Type != assertType)
            throw new SyntaxErrorException($"Expected token of type {assertType}, but encountered {token.Type}.");
    }

    /// <summary>
    /// Parses child tags until the closing brace of a compound is reached.
    /// </summary>
    private static CompoundTag ParseCompound(string name, Queue<Token> queue)
    {
        var compound = new CompoundTag(name);
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndCompound)
        {
            compound.Add(Parse(queue));
        }
        // Consume (and validate) the closing brace.
        MoveNext(queue, TokenType.EndCompound);
        return compound;
    }

    /// <summary>
    /// Parses list elements until the closing bracket is reached.
    /// The list's element type is taken from its first child; an empty list is typed as <see cref="TagType.End"/>.
    /// </summary>
    private static ListTag ParseList(string name, Queue<Token> queue)
    {
        var values = new List<Tag>();
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndArray)
        {
            values.Add(Parse(queue));
        }
        MoveNext(queue, TokenType.EndArray);
        if (values.Count > 0)
        {
            var type = values[0].Type;
            return new ListTag(name, type, values);
        }
        return new ListTag(name, TagType.End);
    }

    /// <summary>
    /// Parses the elements of a byte array ("[B;...]") up to its closing bracket.
    /// </summary>
    private static ByteArrayTag ParseByteArray(string name, Queue<Token> queue)
    {
        var values = new List<byte>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Byte)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Byte}, got {token.Type}.");
            // SNBT bytes are signed; reinterpret the bits as an unsigned byte for storage.
            values.Add(unchecked((byte) sbyte.Parse(token.Value, CultureInfo.InvariantCulture)));
        }
        return new ByteArrayTag(name, values);
    }

    /// <summary>
    /// Parses the elements of an integer array ("[I;...]") up to its closing bracket.
    /// </summary>
    private static IntArrayTag ParseIntArray(string name, Queue<Token> queue)
    {
        var values = new List<int>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Int)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Int}, got {token.Type}.");
            values.Add(int.Parse(token.Value, CultureInfo.InvariantCulture));
        }
        return new IntArrayTag(name, values);
    }

    /// <summary>
    /// Parses the elements of a long array ("[L;...]") up to its closing bracket.
    /// </summary>
    private static LongArrayTag ParseLongArray(string name, Queue<Token> queue)
    {
        var values = new List<long>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Long)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Long}, got {token.Type}.");
            values.Add(long.Parse(token.Value, CultureInfo.InvariantCulture));
        }
        return new LongArrayTag(name, values);
    }

    /// <summary>
    /// Yields dequeued tokens until one of the specified <paramref name="type"/> is consumed.
    /// </summary>
    private static IEnumerable<Token> DequeueUntil(Queue<Token> queue, TokenType type)
    {
        while (true)
        {
            var token = MoveNext(queue);
            if (token.Type == type)
                yield break;
            yield return token;
        }
    }
}
}

35
SharpNBT/SNBT/Token.cs Normal file
View File

@ -0,0 +1,35 @@
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// An object emitted by the lexer to describe a logical fragment of code that can be parsed.
/// </summary>
[PublicAPI]
public sealed class Token
{
    /// <summary>
    /// Creates a new instance of the <see cref="Token"/> class.
    /// </summary>
    /// <param name="type">A value describing the general type of code fragment this <see cref="Token"/> represents.</param>
    /// <param name="value">The value of this code fragment.</param>
    public Token(TokenType type, [NotNull] string value)
    {
        Type = type;
        Value = value;
    }

    /// <summary>
    /// Gets a value describing the general type of code fragment this <see cref="Token"/> represents.
    /// </summary>
    public TokenType Type { get; }

    /// <summary>
    /// Gets the value of this fragment, which can vary depending on context and the <see cref="Type"/>.
    /// </summary>
    public string Value { get; }

    /// <inheritdoc />
    public override string ToString() => $"[{Type}] \"{Value}\"";
}
}

View File

@ -0,0 +1,98 @@
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// Describes types of tokens that the SNBT lexer can emit.
/// </summary>
[PublicAPI]
public enum TokenType
{
/// <summary>
/// Any whitespace/newline not found within a string or identifier.
/// </summary>
/// <remarks>This type is not yielded during tokenization.</remarks>
Whitespace,
/// <summary>
/// A separator between objects and array elements.
/// </summary>
/// <remarks>This type is not yielded during tokenization.</remarks>
Separator,
/// <summary>
/// The beginning of a new <see cref="CompoundTag"/> object.
/// </summary>
Compound,
/// <summary>
/// The end of a <see cref="CompoundTag"/>.
/// </summary>
EndCompound,
/// <summary>
/// The name of a tag.
/// </summary>
Identifier,
/// <summary>
/// A <see cref="StringTag"/> value, which may contain escaped quotes.
/// </summary>
String,
/// <summary>
/// The beginning of a <see cref="ByteArrayTag"/>.
/// </summary>
ByteArray,
/// <summary>
/// The beginning of an <see cref="IntArrayTag"/>.
/// </summary>
IntArray,
/// <summary>
/// The beginning of a <see cref="LongArrayTag"/>.
/// </summary>
LongArray,
/// <summary>
/// The beginning of a <see cref="ListTag"/>.
/// </summary>
List,
/// <summary>
/// The end of a <see cref="ByteArrayTag"/>, <see cref="IntArrayTag"/>, <see cref="LongArrayTag"/> or <see cref="ListTag"/>.
/// </summary>
EndArray,
/// <summary>
/// A <see cref="ByteTag"/> value or element of a <see cref="ByteArrayTag"/> depending on context.
/// </summary>
Byte,
/// <summary>
/// A <see cref="ShortTag"/> value.
/// </summary>
Short,
/// <summary>
/// An <see cref="IntTag"/> value or element of an <see cref="IntArrayTag"/> depending on context.
/// </summary>
Int,
/// <summary>
/// A <see cref="LongTag"/> value or element of a <see cref="LongArrayTag"/> depending on context.
/// </summary>
Long,
/// <summary>
/// A <see cref="FloatTag"/> value.
/// </summary>
Float,
/// <summary>
/// A <see cref="DoubleTag"/> value.
/// </summary>
Double
}
}