Implemented SNBT parser

This commit is contained in:
ForeverZer0 2021-08-31 22:16:27 -04:00
parent 31aac628b6
commit 92112ac887
8 changed files with 433 additions and 177 deletions

View File

@ -0,0 +1,41 @@
{
"test case": 90,
noQuotes: "HELLO WORLD THIS IS A TEST STRING ÅÄÖ!",
"test with \" escaped quote": 90,
'single quoted with inner "double quotes"': 90,
"double quoted with inner 'single quote' in text": -45.0f,
shortTest: 32767s,
longTest: 9223372036854775807L,
byteTest: 127b,
byteArrayTest: [B; 0B, 62B, 34B, 16B, 8B, 10B, 22B, 44B, 76B, 18B, 70B, 32B, 4B, 86B, 78B, 80B, 92B, 14B, 46B, 48B],
"listTest (long)": [
-11L,
12L,
13L,
-14L,
15L
],
floatTest: 0.49823147f,
doubleTest: 0.4931287132182315d,
intTest: 2147483647,
"listTest (compound)": [
{
created-on: 1264099775885L,
name: "Compound tag #0"
},
{
created-on: 1264099775885L,
name: "Compound tag #1"
}
],
"nested compound test": {
egg: {
name: "Eggbert",
value: 0.5f
},
ham: {
name: "Hampus",
value: 0.75f
}
}
}

View File

@ -32,6 +32,8 @@
<EmbeddedResource Include="Data\hello_world.nbt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</EmbeddedResource>
<None Remove="Data\bigtest.snbt" />
<EmbeddedResource Include="Data\bigtest.snbt" />
</ItemGroup>
</Project>

View File

@ -1,4 +1,6 @@
using System.IO;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.VisualStudio.TestPlatform.ObjectModel;
using SharpNBT.SNBT;
using Xunit;
@ -34,22 +36,19 @@ namespace SharpNBT.Tests
public void ParseSmall()
{
const string testString = "{name1:123,name2:\"sometext1\",name3:{subname1:456,subname2:\"sometext2\"}}";
var lexer = new Lexer();
foreach (var token in lexer.Tokenize(testString))
{
output.WriteLine($"{token.Type}: \"{token.Match.Trim()}\"");
}
var tag = StringNbt.Parse(testString);
output.WriteLine(tag.PrettyPrinted());
}
[Fact]
public void ParseBig()
{
var testString = File.ReadAllText("/code/ruby/craftbook-nbt/test/bigtest.snbt");
var lexer = new Lexer();
foreach (var token in lexer.Tokenize(testString))
{
output.WriteLine($"{token.Type}: \"{token.Match.Trim()}\"");
}
using var stream = TestHelper.GetFile("bigtest.snbt", CompressionType.None);
using var reader = new StreamReader(stream, Encoding.UTF8);
var testString = reader.ReadToEnd();
var tag = StringNbt.Parse(testString);
output.WriteLine(tag.PrettyPrinted());
}
}
}

View File

@ -1,180 +1,50 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Reflection.Emit;
using System.Security;
using System.Text.RegularExpressions;
using JetBrains.Annotations;
using System.Data;
namespace SharpNBT.SNBT
{
internal enum TokenType
internal sealed class Lexer
{
None,
CompoundBegin,
CompoundEnd,
Identifier,
String,
Separator,
Comma,
ByteArray,
IntArray,
LongArray,
ListArray,
EndArray,
Float,
Double,
Byte,
Short,
Long,
Int,
WhiteSpace,
Char,
EscapedChar
}
internal sealed class LexerRule
{
internal delegate string PostProcessHandler(Match match);
private readonly List<LexerRule> ruleList;
public Regex Matcher { get; }
public TokenType Type { get; }
public string Description { get; }
public string PostProcess(Match match) => handler?.Invoke(match) ?? match.Value;
private readonly PostProcessHandler handler;
public LexerRule(TokenType type, string description, string pattern, [CanBeNull] PostProcessHandler process)
{
Type = type;
Description = description;
Matcher = new Regex(pattern, RegexOptions.Multiline | RegexOptions.CultureInvariant);
handler = process;
}
// public LexerRule(TokenType type, string description, Regex regex)
// {
// Description = description;
// Type = type;
//
// }
}
internal sealed class Token
{
public TokenType Type { get; }
public string Match { get; }
public Token(TokenType type, string match)
{
Type = type;
Type = type;
Match = match;
}
}
internal class Lexer
{
private static readonly string DoubleQuoteIdentifier = "\"(.*?)\"\\s*(?=:)";
private static readonly List<LexerRule> rules;
private const string IDENTIFIER_DOUBLE_QUOTES = "\".*?\"\\s*(?>:)";
private const string IDENTIFIER_SINGLE_QUOTES = "'.*?'\\s*(?>:)";
private const string IDENTIFIER_NO_QUOTES = @"[A-Za-z0-9_-]+\s*(?=:)";
private const string STRING_DOUBLE_QUOTED = "^\\s*\".*?\"";
private const string STRING_SINGLE_QUOTED = "^\\s*'.*?'";
private const string COMPOUND_START = "\\s*{\\s*";
private const string COMPOUND_END = @"\}";
private const string SEPARATOR = "^\\s*:\\s*";
private const string COMMA = "^\\s*,\\s*";
static Lexer()
{
rules = new List<LexerRule>
{
new LexerRule(TokenType.CompoundBegin, "Opening Compound brace", "^{", null),
new LexerRule(TokenType.WhiteSpace, "Useless whitespace", @"^[\s]+", null),
new LexerRule(TokenType.Identifier, "Single-quoted name", "^\\s*'(.*?)'\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.Identifier, "Double-quoted name", "^\\s*\"(.*?)\"\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.Identifier, "Unquoted name", "^\\s*([A-Za-z0-9_-]+)\\s*(?=:)", m => m.Groups[1].Value),
new LexerRule(TokenType.String, "Double-quoted string value", "^\"(.*?)\"", null),
new LexerRule(TokenType.String, "Single-quoted string value", "^'(.*?)'", null)
// new LexerRule(TokenType.CompoundBegin, COMPOUND_START),
// new LexerRule(TokenType.CompoundEnd, COMPOUND_END),
// new LexerRule(TokenType.Identifier, IDENTIFIER_DOUBLE_QUOTES),
// new LexerRule(TokenType.Identifier, IDENTIFIER_SINGLE_QUOTES),
// new LexerRule(TokenType.Identifier, IDENTIFIER_NO_QUOTES),
// new LexerRule(TokenType.String, STRING_DOUBLE_QUOTED),
// new LexerRule(TokenType.String, STRING_SINGLE_QUOTED),
// new LexerRule(TokenType.Separator, SEPARATOR),
// new LexerRule(TokenType.Comma, COMMA),
// new LexerRule(TokenType.ByteArray, @"\[B;[\s]*?"),
// new LexerRule(TokenType.IntArray, @"\[I;[\s]*?"),
// new LexerRule(TokenType.LongArray, @"\[L;[\s]*?"),
// new LexerRule(TokenType.ListArray, @"\[[\s]*?"),
// new LexerRule(TokenType.EndArray, @"[\s]*\]"),
// new LexerRule(TokenType.Float, @"-?[0-9]*\.[0-9]+[Ff]"),
// new LexerRule(TokenType.Double, @"-?[0-9]*\.[0-9]+[Dd]?"),
// new LexerRule(TokenType.Byte, "-?([0-9]+)[Bb]"),
// new LexerRule(TokenType.Short, "-?([0-9]+)[Ss]"),
// new LexerRule(TokenType.Long, "-?([0-9]+)[Ll]"),
// new LexerRule(TokenType.Int, "-?([0-9]+)"),
// new LexerRule(TokenType.WhiteSpace, @"[\s]+"),
// new LexerRule(TokenType.String, @"[\S]+"),
// new LexerRule(TokenType.Char, ".")
};
}
private static string Process(Match match)
{
throw new NotImplementedException();
}
public Lexer()
{
ruleList = new List<LexerRule>();
}
public IEnumerable<Token> Tokenize(string input)
{
string.Create(input.Length, input, (span, i) =>
{
});
var pos = 0;
public void AddRule(TokenType type, string pattern, bool skipped = false) => ruleList.Add(new LexerRule(type, pattern, null, skipped));
do
{
Label:
foreach (var rule in rules)
{
var match = rule.Matcher.Match(input, pos);
if (match.Success)
{
yield return new Token(rule.Type, rule.PostProcess(match));
pos = match.Index + match.Length - 1;
break;
}
}
} while (++pos < input.Length);
/// <summary>
/// Registers a tokenization rule with an optional post-processing handler.
/// </summary>
/// <param name="type">The type of token the rule emits.</param>
/// <param name="pattern">The regular expression pattern used to match the rule.</param>
/// <param name="handler">A callback that transforms the matched text, or <see langword="null"/> to use the raw match.</param>
/// <param name="skipped">When <see langword="true"/>, matches are consumed but never emitted as tokens.</param>
public void AddRule(TokenType type, string pattern, ResultHandler handler, bool skipped = false) =>
    ruleList.Add(new LexerRule(type, pattern, handler, skipped));
/// <summary>
/// Splits the given <paramref name="source"/> text into a lazily-evaluated stream of tokens.
/// </summary>
/// <param name="source">The text to tokenize.</param>
/// <returns>A sequence of tokens; rules flagged as skipped are consumed silently.</returns>
/// <exception cref="SyntaxErrorException">No registered rule matches at the current position.</exception>
public IEnumerable<Token> Tokenize(string source)
{
    var position = 0;
    while (position < source.Length)
    {
        LexerRule matchedRule = null;
        Match match = null;

        // Rules are tried in registration order; the first pattern that
        // matches exactly at the current position wins.
        foreach (var candidate in ruleList)
        {
            var attempt = candidate.Pattern.Match(source, position);
            if (!attempt.Success || attempt.Index != position)
                continue;
            matchedRule = candidate;
            match = attempt;
            break;
        }

        if (matchedRule is null)
            throw new SyntaxErrorException($"Unrecognized sequence at index {position}: '{source[position]}'");

        if (!matchedRule.IsSkipped)
            yield return new Token(matchedRule.Type, matchedRule.Process(source, position, match));
        position += match.Length;
    }
}
}
}

View File

@ -0,0 +1,37 @@
using System;
using System.Text.RegularExpressions;
namespace SharpNBT.SNBT
{
internal delegate string ResultHandler(Match match);
internal class LexerRule
{
    // Optional transform applied to the text of a successful match
    // (e.g. extracting a capture group instead of the whole match).
    private readonly ResultHandler processResult;

    /// <summary>
    /// Gets the type of token this rule produces.
    /// </summary>
    public TokenType Type { get; }

    /// <summary>
    /// Gets the compiled regular expression used to match this rule.
    /// </summary>
    public Regex Pattern { get; }

    /// <summary>
    /// Gets a flag indicating whether matches of this rule are omitted from the token stream.
    /// </summary>
    public bool IsSkipped { get; }

    /// <summary>
    /// Creates a rule whose token value is the raw matched text.
    /// </summary>
    public LexerRule(TokenType type, string pattern, bool skipped = false) : this(type, pattern, null, skipped)
    {
    }

    /// <summary>
    /// Creates a rule with an optional handler that post-processes the matched text.
    /// </summary>
    /// <param name="type">The type of token this rule produces.</param>
    /// <param name="pattern">The regular expression pattern to match.</param>
    /// <param name="handler">A callback to transform the matched text, or <see langword="null"/> to use it verbatim.</param>
    /// <param name="skipped">When <see langword="true"/>, matches are consumed but not emitted as tokens.</param>
    public LexerRule(TokenType type, string pattern, ResultHandler handler, bool skipped = false)
    {
        Type = type;
        // Rules are constructed once and reused for every tokenization pass,
        // so the up-front cost of compiling the pattern pays off quickly.
        Pattern = new Regex(pattern, RegexOptions.Compiled | RegexOptions.CultureInvariant);
        IsSkipped = skipped;
        processResult = handler;
    }

    /// <summary>
    /// Produces the token value for a successful match, applying the post-process handler when present.
    /// </summary>
    public string Process(string source, int index, Match match)
    {
        return processResult is null ? source.Substring(index, match.Length) : processResult.Invoke(match);
    }
}
}

174
SharpNBT/SNBT/StringNbt.cs Normal file
View File

@ -0,0 +1,174 @@
using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// Provides parsing of string-NBT (SNBT) formatted text into strongly-typed tags.
/// </summary>
public static class StringNbt
{
    // Shared tokenizer. Rule order matters: the first rule that matches at the
    // current position wins, so more specific patterns (e.g. "[B;") must be
    // registered before more general ones (e.g. "[").
    private static readonly Lexer lexer;

    static StringNbt()
    {
        lexer = new Lexer();
        lexer.AddRule(TokenType.Whitespace, @"(\r|\t|\v|\f|\s)+?", true);
        lexer.AddRule(TokenType.Separator, ",", true);
        lexer.AddRule(TokenType.Compound, @"{");
        lexer.AddRule(TokenType.EndCompound, @"}");
        // Quoted names/values may contain backslash-escaped characters (e.g. \" or \\),
        // so match "an escaped character OR any character that is not a quote/backslash".
        lexer.AddRule(TokenType.Identifier, @"""((?:\\.|[^""\\])*)""\s*(?>:)", QuotedValue);
        lexer.AddRule(TokenType.Identifier, @"'((?:\\.|[^'\\])*)'\s*(?>:)", QuotedValue);
        lexer.AddRule(TokenType.Identifier, "([A-Za-z0-9_-]+)\\s*(?>:)", FirstGroupValue);
        lexer.AddRule(TokenType.String, @"""((?:\\.|[^""\\])*)""", QuotedValue);
        lexer.AddRule(TokenType.String, @"'((?:\\.|[^'\\])*)'", QuotedValue);
        lexer.AddRule(TokenType.ByteArray, @"\[B;");
        lexer.AddRule(TokenType.IntArray, @"\[I;");
        lexer.AddRule(TokenType.LongArray, @"\[L;");
        lexer.AddRule(TokenType.List, @"\[");
        lexer.AddRule(TokenType.EndArray, @"\]");
        lexer.AddRule(TokenType.Float, @"(-?[0-9]*\.[0-9]+)[Ff]", FirstGroupValue);
        lexer.AddRule(TokenType.Double, @"(-?[0-9]*\.[0-9]+)[Dd]?", FirstGroupValue);
        lexer.AddRule(TokenType.Byte, "(-?[0-9]+)[Bb]", FirstGroupValue);
        lexer.AddRule(TokenType.Short, "(-?[0-9]+)[Ss]", FirstGroupValue);
        lexer.AddRule(TokenType.Long, "(-?[0-9]+)[Ll]", FirstGroupValue);
        lexer.AddRule(TokenType.Int, "(-?[0-9]+)", FirstGroupValue);
    }

    // Returns the first capture group of a match verbatim.
    private static string FirstGroupValue(Match match) => match.Groups[1].Value;

    // Returns the first capture group with backslash escape sequences resolved.
    private static string QuotedValue(Match match) => Unescape(match.Groups[1].Value);

    /// <summary>
    /// Resolves backslash escape sequences (e.g. \" and \\) within quoted text.
    /// </summary>
    private static string Unescape(string value)
    {
        if (value.IndexOf('\\') < 0)
            return value;

        var buffer = new StringBuilder(value.Length);
        for (var i = 0; i < value.Length; i++)
        {
            var c = value[i];
            // A backslash escapes the character that follows it.
            if (c == '\\' && i + 1 < value.Length)
                c = value[++i];
            buffer.Append(c);
        }
        return buffer.ToString();
    }

    /// <summary>
    /// Parses the specified SNBT <paramref name="source"/> string into a <see cref="CompoundTag"/>.
    /// </summary>
    /// <param name="source">The SNBT text to parse.</param>
    /// <returns>The parsed top-level compound; empty/whitespace input yields an empty unnamed compound.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <see langword="null"/>.</exception>
    /// <exception cref="SyntaxErrorException">The input is not well-formed SNBT.</exception>
    public static CompoundTag Parse([NotNull] string source)
    {
        if (source is null)
            throw new ArgumentNullException(nameof(source));
        if (string.IsNullOrWhiteSpace(source))
            return new CompoundTag(null);

        var queue = new Queue<Token>(lexer.Tokenize(source));
        return Parse<CompoundTag>(queue);
    }

    // Convenience wrapper that casts the parsed tag to the expected type.
    private static T Parse<T>(Queue<Token> queue) where T : Tag => (T)Parse(queue);

    /// <summary>
    /// Parses the next complete tag (optionally preceded by an identifier naming it) from the token queue.
    /// </summary>
    private static Tag Parse(Queue<Token> queue)
    {
        string name = null;
        var token = MoveNext(queue);
        if (token.Type == TokenType.Identifier)
        {
            name = token.Value;
            token = MoveNext(queue);
        }

        // Numeric values are parsed with the invariant culture: SNBT always
        // uses '.' as the decimal separator regardless of the host locale.
        return token.Type switch
        {
            TokenType.Compound => ParseCompound(name, queue),
            TokenType.String => new StringTag(name, token.Value),
            TokenType.ByteArray => ParseByteArray(name, queue),
            TokenType.IntArray => ParseIntArray(name, queue),
            TokenType.LongArray => ParseLongArray(name, queue),
            TokenType.List => ParseList(name, queue),
            TokenType.Byte => new ByteTag(name, sbyte.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Short => new ShortTag(name, short.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Int => new IntTag(name, int.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Long => new LongTag(name, long.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Float => new FloatTag(name, float.Parse(token.Value, CultureInfo.InvariantCulture)),
            TokenType.Double => new DoubleTag(name, double.Parse(token.Value, CultureInfo.InvariantCulture)),
            _ => throw new SyntaxErrorException()
        };
    }

    /// <summary>
    /// Dequeues the next token, failing loudly on premature end-of-input.
    /// </summary>
    [NotNull]
    private static Token MoveNext(Queue<Token> queue)
    {
        if (queue.TryDequeue(out var token))
            return token;
        throw new SyntaxErrorException("Unexpected end-of-input");
    }

    /// <summary>
    /// Dequeues the next token and asserts it is of the specified type.
    /// </summary>
    private static void MoveNext(Queue<Token> queue, TokenType assertType)
    {
        var token = MoveNext(queue);
        if (token.Type != assertType)
            throw new SyntaxErrorException($"Expected token of type {assertType}, but encountered {token.Type}.");
    }

    /// <summary>
    /// Parses child tags until the closing brace of a compound is reached.
    /// </summary>
    private static CompoundTag ParseCompound(string name, Queue<Token> queue)
    {
        var compound = new CompoundTag(name);
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndCompound)
        {
            compound.Add(Parse(queue));
        }
        // Consume (and validate) the closing brace.
        MoveNext(queue, TokenType.EndCompound);
        return compound;
    }

    /// <summary>
    /// Parses list elements until the closing bracket is reached.
    /// The list's element type is taken from its first child; an empty list is typed as <see cref="TagType.End"/>.
    /// </summary>
    private static ListTag ParseList(string name, Queue<Token> queue)
    {
        var values = new List<Tag>();
        while (queue.TryPeek(out var token) && token.Type != TokenType.EndArray)
        {
            values.Add(Parse(queue));
        }
        MoveNext(queue, TokenType.EndArray);
        if (values.Count > 0)
        {
            var type = values[0].Type;
            return new ListTag(name, type, values);
        }
        return new ListTag(name, TagType.End);
    }

    /// <summary>
    /// Parses the elements of a byte array ("[B;...]") up to its closing bracket.
    /// </summary>
    private static ByteArrayTag ParseByteArray(string name, Queue<Token> queue)
    {
        var values = new List<byte>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Byte)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Byte}, got {token.Type}.");
            // SNBT bytes are signed; reinterpret the bits as an unsigned byte for storage.
            values.Add(unchecked((byte) sbyte.Parse(token.Value, CultureInfo.InvariantCulture)));
        }
        return new ByteArrayTag(name, values);
    }

    /// <summary>
    /// Parses the elements of an integer array ("[I;...]") up to its closing bracket.
    /// </summary>
    private static IntArrayTag ParseIntArray(string name, Queue<Token> queue)
    {
        var values = new List<int>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Int)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Int}, got {token.Type}.");
            values.Add(int.Parse(token.Value, CultureInfo.InvariantCulture));
        }
        return new IntArrayTag(name, values);
    }

    /// <summary>
    /// Parses the elements of a long array ("[L;...]") up to its closing bracket.
    /// </summary>
    private static LongArrayTag ParseLongArray(string name, Queue<Token> queue)
    {
        var values = new List<long>();
        foreach (var token in DequeueUntil(queue, TokenType.EndArray))
        {
            if (token.Type != TokenType.Long)
                throw new SyntaxErrorException($"Invalid token type in array, expected {TokenType.Long}, got {token.Type}.");
            values.Add(long.Parse(token.Value, CultureInfo.InvariantCulture));
        }
        return new LongArrayTag(name, values);
    }

    /// <summary>
    /// Yields dequeued tokens until one of the specified <paramref name="type"/> is consumed.
    /// </summary>
    private static IEnumerable<Token> DequeueUntil(Queue<Token> queue, TokenType type)
    {
        while (true)
        {
            var token = MoveNext(queue);
            if (token.Type == type)
                yield break;
            yield return token;
        }
    }
}
}

35
SharpNBT/SNBT/Token.cs Normal file
View File

@ -0,0 +1,35 @@
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// An object emitted by the lexer to describe a logical fragment of code that can be parsed.
/// </summary>
[PublicAPI]
public sealed class Token
{
    /// <summary>
    /// Creates a new instance of the <see cref="Token"/> class.
    /// </summary>
    /// <param name="type">A value describing the general type of code fragment this <see cref="Token"/> represents.</param>
    /// <param name="value">The value of this code fragment.</param>
    public Token(TokenType type, [NotNull] string value)
    {
        Type = type;
        Value = value;
    }

    /// <summary>
    /// Gets a value describing the general type of code fragment this <see cref="Token"/> represents.
    /// </summary>
    public TokenType Type { get; }

    /// <summary>
    /// Gets the value of this fragment, which can vary depending on context and the <see cref="Type"/>.
    /// </summary>
    public string Value { get; }

    /// <inheritdoc />
    public override string ToString() => $"[{Type}] \"{Value}\"";
}
}

View File

@ -0,0 +1,98 @@
using JetBrains.Annotations;
namespace SharpNBT.SNBT
{
/// <summary>
/// Describes types of tokens that the SNBT lexer can emit.
/// </summary>
[PublicAPI]
public enum TokenType
{
/// <summary>
/// Any whitespace/newline not found within a string or identifier.
/// </summary>
/// <remarks>This type is not yielded during tokenization.</remarks>
Whitespace,
/// <summary>
/// A separator between objects and array elements.
/// </summary>
/// <remarks>This type is not yielded during tokenization.</remarks>
Separator,
/// <summary>
/// The beginning of a new <see cref="CompoundTag"/> object.
/// </summary>
Compound,
/// <summary>
/// The end of a <see cref="CompoundTag"/>.
/// </summary>
EndCompound,
/// <summary>
/// The name of a tag.
/// </summary>
Identifier,
/// <summary>
/// A <see cref="StringTag"/> value, which may contain escaped quotes.
/// </summary>
String,
/// <summary>
/// The beginning of a <see cref="ByteArrayTag"/>.
/// </summary>
ByteArray,
/// <summary>
/// The beginning of an <see cref="IntArrayTag"/>.
/// </summary>
IntArray,
/// <summary>
/// The beginning of a <see cref="LongArrayTag"/>.
/// </summary>
LongArray,
/// <summary>
/// The beginning of a <see cref="ListTag"/>.
/// </summary>
List,
/// <summary>
/// The end of a <see cref="ByteArrayTag"/>, <see cref="IntArrayTag"/>, <see cref="LongArrayTag"/> or <see cref="ListTag"/>.
/// </summary>
EndArray,
/// <summary>
/// A <see cref="ByteTag"/> value or element of a <see cref="ByteArrayTag"/> depending on context.
/// </summary>
Byte,
/// <summary>
/// A <see cref="ShortTag"/> value.
/// </summary>
Short,
/// <summary>
/// An <see cref="IntTag"/> value or element of an <see cref="IntArrayTag"/> depending on context.
/// </summary>
Int,
/// <summary>
/// A <see cref="LongTag"/> value or element of a <see cref="LongArrayTag"/> depending on context.
/// </summary>
Long,
/// <summary>
/// A <see cref="FloatTag"/> value.
/// </summary>
Float,
/// <summary>
/// A <see cref="DoubleTag"/> value.
/// </summary>
Double
}
}