diff --git a/SharpNBT.Tests/Data/bigtest.snbt b/SharpNBT.Tests/Data/bigtest.snbt new file mode 100644 index 0000000..e4a523d --- /dev/null +++ b/SharpNBT.Tests/Data/bigtest.snbt @@ -0,0 +1,41 @@ +{ + "test case": 90, + noQuotes: "HELLO WORLD THIS IS A TEST STRING ÅÄÖ!", + "test with \" escaped quote": 90, + 'single quoted with inner "double quotes"': 90, + "double quoted with inner 'single quote' in text": -45.0f, + shortTest: 32767s, + longTest: 9223372036854775807L, + byteTest: 127b, + byteArrayTest: [B; 0B, 62B, 34B, 16B, 8B, 10B, 22B, 44B, 76B, 18B, 70B, 32B, 4B, 86B, 78B, 80B, 92B, 14B, 46B, 48B], + "listTest (long)": [ + -11L, + 12L, + 13L, + -14L, + 15L + ], + floatTest: 0.49823147f, + doubleTest: 0.4931287132182315d, + intTest: 2147483647, + "listTest (compound)": [ + { + created-on: 1264099775885L, + name: "Compound tag #0" + }, + { + created-on: 1264099775885L, + name: "Compound tag #1" + } + ], + "nested compound test": { + egg: { + name: "Eggbert", + value: 0.5f + }, + ham: { + name: "Hampus", + value: 0.75f + } + } +} \ No newline at end of file diff --git a/SharpNBT.Tests/SharpNBT.Tests.csproj b/SharpNBT.Tests/SharpNBT.Tests.csproj index f8b48cf..15aafb5 100644 --- a/SharpNBT.Tests/SharpNBT.Tests.csproj +++ b/SharpNBT.Tests/SharpNBT.Tests.csproj @@ -32,6 +32,8 @@ PreserveNewest + + diff --git a/SharpNBT.Tests/StringifiedTest.cs b/SharpNBT.Tests/StringifiedTest.cs index 05b7adc..8b60ab9 100644 --- a/SharpNBT.Tests/StringifiedTest.cs +++ b/SharpNBT.Tests/StringifiedTest.cs @@ -1,4 +1,6 @@ using System.IO; +using System.Runtime.InteropServices; +using System.Text; using Microsoft.VisualStudio.TestPlatform.ObjectModel; using SharpNBT.SNBT; using Xunit; @@ -34,22 +36,19 @@ namespace SharpNBT.Tests public void ParseSmall() { const string testString = "{name1:123,name2:\"sometext1\",name3:{subname1:456,subname2:\"sometext2\"}}"; - var lexer = new Lexer(); - foreach (var token in lexer.Tokenize(testString)) - { - output.WriteLine($"{token.Type}: 
\"{token.Match.Trim()}\""); - } + var tag = StringNbt.Parse(testString); + output.WriteLine(tag.PrettyPrinted()); } [Fact] public void ParseBig() { - var testString = File.ReadAllText("/code/ruby/craftbook-nbt/test/bigtest.snbt"); - var lexer = new Lexer(); - foreach (var token in lexer.Tokenize(testString)) - { - output.WriteLine($"{token.Type}: \"{token.Match.Trim()}\""); - } + using var stream = TestHelper.GetFile("bigtest.snbt", CompressionType.None); + using var reader = new StreamReader(stream, Encoding.UTF8); + var testString = reader.ReadToEnd(); + + var tag = StringNbt.Parse(testString); + output.WriteLine(tag.PrettyPrinted()); } } } \ No newline at end of file diff --git a/SharpNBT/SNBT/Lexer.cs b/SharpNBT/SNBT/Lexer.cs index 7f620f9..03fb767 100644 --- a/SharpNBT/SNBT/Lexer.cs +++ b/SharpNBT/SNBT/Lexer.cs @@ -1,180 +1,50 @@ -using System; using System.Collections.Generic; -using System.IO; -using System.Reflection.Emit; -using System.Security; -using System.Text.RegularExpressions; -using JetBrains.Annotations; +using System.Data; namespace SharpNBT.SNBT { - internal enum TokenType + internal sealed class Lexer { - None, - CompoundBegin, - CompoundEnd, - Identifier, - String, - Separator, - Comma, - ByteArray, - IntArray, - LongArray, - ListArray, - EndArray, - Float, - Double, - Byte, - Short, - Long, - Int, - WhiteSpace, - Char, - EscapedChar - } - - - internal sealed class LexerRule - { - - internal delegate string PostProcessHandler(Match match); + private readonly List ruleList; - public Regex Matcher { get; } - - public TokenType Type { get; } - - public string Description { get; } - - public string PostProcess(Match match) => handler?.Invoke(match) ?? 
match.Value; - - private readonly PostProcessHandler handler; - - public LexerRule(TokenType type, string description, string pattern, [CanBeNull] PostProcessHandler process) - { - Type = type; - Description = description; - Matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.CultureInvariant); - handler = process; - } - - // public LexerRule(TokenType type, string description, Regex regex) - // { - // Description = description; - // Type = type; - // - // } - } - - internal sealed class Token - { - public TokenType Type { get; } - - public string Match { get; } - - public Token(TokenType type, string match) - { - Type = type; - Type = type; - Match = match; - } - } - - internal class Lexer - { - private static readonly string DoubleQuoteIdentifier = "\"(.*?)\"\\s*(?=:)"; - - private static readonly List rules; - - - private const string IDENTIFIER_DOUBLE_QUOTES = "\".*?\"\\s*(?>:)"; - private const string IDENTIFIER_SINGLE_QUOTES = "'.*?'\\s*(?>:)"; - private const string IDENTIFIER_NO_QUOTES = @"[A-Za-z0-9_-]+\s*(?=:)"; - - private const string STRING_DOUBLE_QUOTED = "^\\s*\".*?\""; - private const string STRING_SINGLE_QUOTED = "^\\s*'.*?'"; - - private const string COMPOUND_START = "\\s*{\\s*"; - private const string COMPOUND_END = @"\}"; - - - private const string SEPARATOR = "^\\s*:\\s*"; - private const string COMMA = "^\\s*,\\s*"; - - - static Lexer() - { - rules = new List - { - new LexerRule(TokenType.CompoundBegin, "Opening Compound brace", "^{", null), - new LexerRule(TokenType.WhiteSpace, "Useless whitespace", @"^[\s]+", null), - - new LexerRule(TokenType.Identifier, "Single-quoted name", "^\\s*'(.*?)'\\s*(?=:)", m => m.Groups[1].Value), - new LexerRule(TokenType.Identifier, "Double-quoted name", "^\\s*\"(.*?)\"\\s*(?=:)", m => m.Groups[1].Value), - new LexerRule(TokenType.Identifier, "Unquoted name", "^\\s*([A-Za-z0-9_-]+)\\s*(?=:)", m => m.Groups[1].Value), - - - new LexerRule(TokenType.String, "Double-quoted string value", 
internal sealed class Lexer
{
    // Rules are evaluated in insertion order; the first rule that matches at the
    // current scan position wins, so register more specific patterns first.
    private readonly List<LexerRule> rules;

    public Lexer()
    {
        rules = new List<LexerRule>();
    }

    /// <summary>Registers a rule whose token value is the raw matched text.</summary>
    public void AddRule(TokenType type, string pattern, bool skipped = false) => rules.Add(new LexerRule(type, pattern, null, skipped));

    /// <summary>Registers a rule whose token value is produced by <paramref name="handler"/>.</summary>
    public void AddRule(TokenType type, string pattern, ResultHandler handler, bool skipped = false)
    {
        rules.Add(new LexerRule(type, pattern, handler, skipped));
    }

    /// <summary>
    /// Scans <paramref name="source"/> from start to end, yielding a token for each
    /// match of a non-skipped rule.
    /// </summary>
    /// <param name="source">The text to tokenize.</param>
    /// <returns>A lazily evaluated sequence of tokens.</returns>
    /// <exception cref="SyntaxErrorException">No registered rule matches at some position.</exception>
    public IEnumerable<Token> Tokenize(string source)
    {
        var index = 0;
        while (index < source.Length)
        {
            var consumed = -1;

            foreach (var rule in rules)
            {
                var match = rule.Pattern.Match(source, index);
                // Regex.Match(source, index) may find a hit further ahead; only
                // accept matches anchored exactly at the current position.
                if (!match.Success || match.Index != index)
                    continue;

                if (!rule.IsSkipped)
                    yield return new Token(rule.Type, rule.Process(source, index, match));

                consumed = match.Length;
                break;
            }

            if (consumed < 0)
                throw new SyntaxErrorException($"Unrecognized sequence at index {index}: '{source[index]}'");

            index += consumed;
        }
    }

}
source.Substring(index, match.Length) : processResult.Invoke(match); + } + } +} \ No newline at end of file diff --git a/SharpNBT/SNBT/StringNbt.cs b/SharpNBT/SNBT/StringNbt.cs new file mode 100644 index 0000000..53e77ba --- /dev/null +++ b/SharpNBT/SNBT/StringNbt.cs @@ -0,0 +1,174 @@ +using System; +using System.Collections.Generic; +using System.Data; +using System.Text.RegularExpressions; +using JetBrains.Annotations; + +namespace SharpNBT.SNBT +{ + public static class StringNbt + { + private static readonly Lexer lexer; + + static StringNbt() + { + lexer = new Lexer(); + lexer.AddRule(TokenType.Whitespace, @"(\r|\t|\v|\f|\s)+?", true); + lexer.AddRule(TokenType.Separator, ",", true); + lexer.AddRule(TokenType.Compound, @"{"); + lexer.AddRule(TokenType.EndCompound, @"}"); + lexer.AddRule(TokenType.Identifier, "\"(.*?)\"\\s*(?>:)", FirstGroupValue); + lexer.AddRule(TokenType.Identifier, "'(.*?)'\\s*(?>:)", FirstGroupValue); + lexer.AddRule(TokenType.Identifier, "([A-Za-z0-9_-]+)\\s*(?>:)", FirstGroupValue); + lexer.AddRule(TokenType.String, "\"(.*?)\"", FirstGroupValue); + lexer.AddRule(TokenType.String, "'(.*?)'", FirstGroupValue); + lexer.AddRule(TokenType.ByteArray, @"\[B;"); + lexer.AddRule(TokenType.IntArray, @"\[I;"); + lexer.AddRule(TokenType.LongArray, @"\[L;"); + lexer.AddRule(TokenType.List, @"\["); + lexer.AddRule(TokenType.EndArray, @"\]"); + lexer.AddRule(TokenType.Float, @"(-?[0-9]*\.[0-9]+)[Ff]", FirstGroupValue); + lexer.AddRule(TokenType.Double, @"(-?[0-9]*\.[0-9]+)[Dd]?", FirstGroupValue); + lexer.AddRule(TokenType.Byte, "(-?[0-9]+)[Bb]", FirstGroupValue); + lexer.AddRule(TokenType.Short, "(-?[0-9]+)[Ss]", FirstGroupValue); + lexer.AddRule(TokenType.Long, "(-?[0-9]+)[Ll]", FirstGroupValue); + lexer.AddRule(TokenType.Int, "(-?[0-9]+)", FirstGroupValue); + } + + private static string FirstGroupValue(Match match) => match.Groups[1].Value; + + public static CompoundTag Parse([NotNull] string source) + { + if (source is null) + throw new 
/// <summary>
/// Parser for string-NBT (SNBT) documents, producing a <see cref="CompoundTag"/> tree.
/// </summary>
public static class StringNbt
{
    // Shared lexer; rules are matched in registration order, so more specific
    // patterns (e.g. suffixed numbers) must be registered before general ones.
    private static readonly Lexer lexer;

    // SNBT always uses '.' as the decimal separator and plain ASCII digits, so all
    // numeric parsing must be culture-invariant (a comma-decimal locale would
    // otherwise reject or misread values). Fully qualified to avoid a new using.
    private static readonly System.Globalization.CultureInfo Invariant = System.Globalization.CultureInfo.InvariantCulture;

    static StringNbt()
    {
        lexer = new Lexer();
        // Whitespace and commas are purely structural and never yielded as tokens.
        lexer.AddRule(TokenType.Whitespace, @"\s+", true);
        lexer.AddRule(TokenType.Separator, ",", true);
        lexer.AddRule(TokenType.Compound, @"{");
        lexer.AddRule(TokenType.EndCompound, @"}");
        // Identifier rules consume the trailing ':' via the atomic group (?>:).
        // Quoted identifiers and strings may contain backslash-escaped characters
        // (e.g. \" inside a double-quoted name), which the handler unescapes.
        lexer.AddRule(TokenType.Identifier, "\"((?:[^\"\\\\]|\\\\.)*)\"\\s*(?>:)", UnescapedFirstGroup);
        lexer.AddRule(TokenType.Identifier, "'((?:[^'\\\\]|\\\\.)*)'\\s*(?>:)", UnescapedFirstGroup);
        lexer.AddRule(TokenType.Identifier, "([A-Za-z0-9_-]+)\\s*(?>:)", FirstGroupValue);
        lexer.AddRule(TokenType.String, "\"((?:[^\"\\\\]|\\\\.)*)\"", UnescapedFirstGroup);
        lexer.AddRule(TokenType.String, "'((?:[^'\\\\]|\\\\.)*)'", UnescapedFirstGroup);
        lexer.AddRule(TokenType.ByteArray, @"\[B;");
        lexer.AddRule(TokenType.IntArray, @"\[I;");
        lexer.AddRule(TokenType.LongArray, @"\[L;");
        lexer.AddRule(TokenType.List, @"\[");
        lexer.AddRule(TokenType.EndArray, @"\]");
        // Suffixed numeric types first so e.g. "123b" is not consumed as Int "123".
        lexer.AddRule(TokenType.Float, @"(-?[0-9]*\.[0-9]+)[Ff]", FirstGroupValue);
        lexer.AddRule(TokenType.Double, @"(-?[0-9]*\.[0-9]+)[Dd]?", FirstGroupValue);
        lexer.AddRule(TokenType.Byte, "(-?[0-9]+)[Bb]", FirstGroupValue);
        lexer.AddRule(TokenType.Short, "(-?[0-9]+)[Ss]", FirstGroupValue);
        lexer.AddRule(TokenType.Long, "(-?[0-9]+)[Ll]", FirstGroupValue);
        lexer.AddRule(TokenType.Int, "(-?[0-9]+)", FirstGroupValue);
    }

    // Returns the first capture group of a rule match unchanged.
    private static string FirstGroupValue(Match match) => match.Groups[1].Value;

    // Returns the first capture group with backslash escapes (\" \' \\) resolved.
    private static string UnescapedFirstGroup(Match match) => Regex.Replace(match.Groups[1].Value, @"\\(.)", "$1");

    /// <summary>
    /// Parses an SNBT document into its root compound tag.
    /// </summary>
    /// <param name="source">The SNBT text to parse. Must not be <see langword="null"/>.</param>
    /// <returns>The parsed root tag; an empty unnamed compound when <paramref name="source"/> is blank.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <see langword="null"/>.</exception>
    /// <exception cref="SyntaxErrorException">The input is not well-formed SNBT.</exception>
    public static CompoundTag Parse([NotNull] string source)
    {
        if (source is null)
            throw new ArgumentNullException(nameof(source));

        if (string.IsNullOrWhiteSpace(source))
            return new CompoundTag(null);

        var queue = new Queue<Token>(lexer.Tokenize(source));
        return Parse(queue);
    }

    // Typed convenience wrapper over the untyped recursive parser.
    private static T Parse<T>(Queue<Token> queue) where T : Tag => (T)Parse(queue);

    // Parses the next complete tag (with optional leading identifier) from the queue.
    private static Tag Parse(Queue<Token> queue)
    {
        string name = null;
        var token = MoveNext(queue);

        if (token.Type == TokenType.Identifier)
        {
            name = token.Value;
            token = MoveNext(queue);
        }

        return token.Type switch
        {
            TokenType.Compound => ParseCompound(name, queue),
            TokenType.String => new StringTag(name, token.Value),
            TokenType.ByteArray => ParseByteArray(name, queue),
            TokenType.IntArray => ParseIntArray(name, queue),
            TokenType.LongArray => ParseLongArray(name, queue),
            TokenType.List => ParseList(name, queue),
            TokenType.Byte => new ByteTag(name, sbyte.Parse(token.Value, Invariant)),
            TokenType.Short => new ShortTag(name, short.Parse(token.Value, Invariant)),
            TokenType.Int => new IntTag(name, int.Parse(token.Value, Invariant)),
            TokenType.Long => new LongTag(name, long.Parse(token.Value, Invariant)),
            TokenType.Float => new FloatTag(name, float.Parse(token.Value, Invariant)),
            TokenType.Double => new DoubleTag(name, double.Parse(token.Value, Invariant)),
            _ => throw new SyntaxErrorException($"Unexpected token of type {token.Type}.")
        };
    }

    /// <summary>Dequeues the next token, throwing if the input ended prematurely.</summary>
    [NotNull]
    private static Token MoveNext(Queue<Token> queue)
    {
        if (queue.TryDequeue(out var token))
            return token;

        throw new SyntaxErrorException("Unexpected end-of-input");
    }

    // Dequeues the next token and asserts it is of the given type.
    private static void MoveNext(Queue<Token> queue, TokenType assertType)
    {
        var token = MoveNext(queue);
        if (token.Type != assertType)
            throw new SyntaxErrorException($"Expected token of type {assertType}, but encountered {token.Type}.");
    }

    // Parses child tags until the closing '}' is consumed.
    private static CompoundTag ParseCompound(string name, Queue<Token> queue)
    {
        var compound = new CompoundTag(name);
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndCompound)
        {
            compound.Add(Parse(queue));
        }
        MoveNext(queue, TokenType.EndCompound);
        return compound;
    }

    // Parses list elements until ']'; the list's child type is taken from the first
    // element. NOTE(review): element homogeneity is presumably enforced by ListTag
    // itself — confirm, otherwise mixed-type lists pass through silently.
    private static ListTag ParseList(string name, Queue<Token> queue)
    {
        var values = new List<Tag>();
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndArray)
        {
            values.Add(Parse(queue));
        }

        MoveNext(queue, TokenType.EndArray);
        if (values.Count > 0)
        {
            var type = values[0].Type;
            return new ListTag(name, type, values);
        }
        // An empty list has no element to infer a child type from; use TAG_End.
        return new ListTag(name, TagType.End);
    }

    private static ByteArrayTag ParseByteArray(string name, Queue<Token> queue)
    {
        var values = new List<byte>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Byte)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Byte}, got {token.Type}.");
            // SNBT bytes are signed; reinterpret the bit pattern as an unsigned byte.
            values.Add(unchecked((byte) sbyte.Parse(token.Value, Invariant)));
        }
        return new ByteArrayTag(name, values);
    }

    private static IntArrayTag ParseIntArray(string name, Queue<Token> queue)
    {
        var values = new List<int>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Int)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Int}, got {token.Type}.");
            values.Add(int.Parse(token.Value, Invariant));
        }
        return new IntArrayTag(name, values);
    }

    private static LongArrayTag ParseLongArray(string name, Queue<Token> queue)
    {
        var values = new List<long>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Long)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Long}, got {token.Type}.");
            values.Add(long.Parse(token.Value, Invariant));
        }
        return new LongArrayTag(name, values);
    }

    // Yields tokens up to (and consuming, but not yielding) the terminator of the given type.
    private static IEnumerable<Token> DequeueUntil(Queue<Token> queue, TokenType type)
    {
        while (true)
        {
            var token = MoveNext(queue);
            if (token.Type == type)
                yield break;
            yield return token;
        }
    }
}
-0,0 +1,35 @@ +using JetBrains.Annotations; + +namespace SharpNBT.SNBT +{ + /// + /// An object emitted by the lexer to describe a logical fragment of code that can be parsed. + /// + [PublicAPI] + public sealed class Token + { + /// + /// Gets a value describing the general type code fragment this represents. + /// + public TokenType Type { get; } + + /// + /// Gets a value of this fragment, which can vary depending on context and the . + /// + public string Value { get; } + + /// + /// Creates a new instance of the class. + /// + /// A value describing the general type code fragment this represents. + /// Ahe value of this code fragment. + public Token(TokenType type, [NotNull] string value) + { + Type = type; + Value = value; + } + + /// + public override string ToString() => $"[{Type}] \"{Value}\""; + } +} \ No newline at end of file diff --git a/SharpNBT/SNBT/TokenType.cs b/SharpNBT/SNBT/TokenType.cs new file mode 100644 index 0000000..1d08a68 --- /dev/null +++ b/SharpNBT/SNBT/TokenType.cs @@ -0,0 +1,98 @@ +using JetBrains.Annotations; + +namespace SharpNBT.SNBT +{ + /// + /// Describes types of tokens that the SNBT lexer can emit. + /// + [PublicAPI] + public enum TokenType + { + /// + /// Any whitespace/newline not found within a string or identifier. + /// + /// This type is not yielded during tokenization. + Whitespace, + + /// + /// A separator between objects and array elements. + /// + /// This type is not yielded during tokenization. + Separator, + + /// + /// The beginning of new object. + /// + Compound, + + /// + /// The end of a . + /// + EndCompound, + + /// + /// The name of an tag. + /// + Identifier, + + /// + /// A value, which may be contain escaped quotes. + /// + String, + + /// + /// The beginning of a . + /// + ByteArray, + + /// + /// The beginning of a . + /// + IntArray, + + /// + /// The beginning of a . + /// + LongArray, + + /// + /// The beginning of a . + /// + List, + + /// + /// The end of a , , or . 
+ /// + EndArray, + + /// + /// A value or element of a depending on context. + /// + Byte, + + /// + /// A value. + /// + Short, + + /// + /// A value or element of a depending on context. + /// + Int, + + /// + /// A value or element of a depending on context. + /// + Long, + + /// + /// A value. + /// + Float, + + /// + /// A value. + /// + Double + } +} \ No newline at end of file