Browse Source

Encode unicode tries as ReadOnlySpan<uint> (#15074)

release/11.1.0-beta2
Julien Lebosquain 2 years ago
committed by Max Katz
parent
commit
2cd500f667
  1. 1595
      src/Avalonia.Base/Media/TextFormatting/Unicode/BiDi.trie.cs
  2. 1271
      src/Avalonia.Base/Media/TextFormatting/Unicode/GraphemeBreak.trie.cs
  3. 28
      src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.cs
  4. 3383
      src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.trie.cs
  5. 131
      src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeTrie.cs
  6. 13
      src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeTrieBuilder.cs
  7. 6
      tests/Avalonia.Base.UnitTests/Media/TextFormatting/GraphemeBreakClassTrieGeneratorTests.cs
  8. 92
      tests/Avalonia.Base.UnitTests/Media/TextFormatting/UnicodeDataGenerator.cs

1595
src/Avalonia.Base/Media/TextFormatting/Unicode/BiDi.trie.cs

File diff suppressed because it is too large

1271
src/Avalonia.Base/Media/TextFormatting/Unicode/GraphemeBreak.trie.cs

File diff suppressed because it is too large

28
src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.cs

@ -1,5 +1,4 @@
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.CompilerServices;
namespace Avalonia.Media.TextFormatting.Unicode
{
@ -30,17 +29,6 @@ namespace Avalonia.Media.TextFormatting.Unicode
internal const int BIDIPAIREDBRACKEDTYPE_MASK = (1 << BIDIPAIREDBRACKEDTYPE_BITS) - 1;
internal const int BIDICLASS_MASK = (1 << BIDICLASS_BITS) - 1;
private static readonly UnicodeTrie s_unicodeDataTrie;
private static readonly UnicodeTrie s_graphemeBreakTrie;
private static readonly UnicodeTrie s_biDiTrie;
// Type initializer: wraps each generated data table in its own UnicodeTrie and stores it
// in the matching static field.
static UnicodeData()
{
    s_unicodeDataTrie = new(UnicodeDataTrie.Data);
    s_graphemeBreakTrie = new(GraphemeBreakTrie.Data);
    s_biDiTrie = new(BidiTrie.Data);
}
/// <summary>
/// Gets the <see cref="GeneralCategory"/> for a Unicode codepoint.
/// </summary>
@ -49,7 +37,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static GeneralCategory GetGeneralCategory(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): looks the code point up in the
// static s_unicodeDataTrie field and keeps only the CATEGORY_MASK bits.
return (GeneralCategory)(s_unicodeDataTrie.Get(codepoint) & CATEGORY_MASK);
// Post-change form (presumably the added side): identical masking, but the trie is obtained
// from the UnicodeDataTrie.Trie accessor instead of a static field.
return (GeneralCategory)(UnicodeDataTrie.Trie.Get(codepoint) & CATEGORY_MASK);
}
/// <summary>
@ -60,7 +48,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Script GetScript(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): reads the packed value from the
// static s_unicodeDataTrie field, shifts it right by SCRIPT_SHIFT and masks with SCRIPT_MASK.
return (Script)((s_unicodeDataTrie.Get(codepoint) >> SCRIPT_SHIFT) & SCRIPT_MASK);
// Post-change form (presumably the added side): same shift-and-mask decoding, with the trie
// obtained from the UnicodeDataTrie.Trie accessor.
return (Script)((UnicodeDataTrie.Trie.Get(codepoint) >> SCRIPT_SHIFT) & SCRIPT_MASK);
}
/// <summary>
@ -71,7 +59,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static BidiClass GetBiDiClass(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): reads the packed value from the
// static s_biDiTrie field, shifts it right by BIDICLASS_SHIFT and masks with BIDICLASS_MASK.
return (BidiClass)((s_biDiTrie.Get(codepoint) >> BIDICLASS_SHIFT) & BIDICLASS_MASK);
// Post-change form (presumably the added side): same shift-and-mask decoding, with the trie
// obtained from the BidiTrie.Trie accessor.
return (BidiClass)((BidiTrie.Trie.Get(codepoint) >> BIDICLASS_SHIFT) & BIDICLASS_MASK);
}
/// <summary>
@ -82,7 +70,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static BidiPairedBracketType GetBiDiPairedBracketType(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): reads the packed value from the
// static s_biDiTrie field, shifts it right by BIDIPAIREDBRACKEDTYPE_SHIFT and masks with
// BIDIPAIREDBRACKEDTYPE_MASK.
return (BidiPairedBracketType)((s_biDiTrie.Get(codepoint) >> BIDIPAIREDBRACKEDTYPE_SHIFT) & BIDIPAIREDBRACKEDTYPE_MASK);
// Post-change form (presumably the added side): same shift-and-mask decoding, with the trie
// obtained from the BidiTrie.Trie accessor.
return (BidiPairedBracketType)((BidiTrie.Trie.Get(codepoint) >> BIDIPAIREDBRACKEDTYPE_SHIFT) & BIDIPAIREDBRACKEDTYPE_MASK);
}
/// <summary>
@ -93,7 +81,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Codepoint GetBiDiPairedBracket(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): masks the value read from the
// static s_biDiTrie field with BIDIPAIREDBRACKED_MASK (no shift) and wraps it in a Codepoint.
return new Codepoint((s_biDiTrie.Get(codepoint) & BIDIPAIREDBRACKED_MASK));
// Post-change form (presumably the added side): same masking without the redundant inner
// parentheses, with the trie obtained from the BidiTrie.Trie accessor.
return new Codepoint(BidiTrie.Trie.Get(codepoint) & BIDIPAIREDBRACKED_MASK);
}
/// <summary>
@ -104,7 +92,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static LineBreakClass GetLineBreakClass(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): reads the packed value from the
// static s_unicodeDataTrie field, shifts it right by LINEBREAK_SHIFT and masks with LINEBREAK_MASK.
return (LineBreakClass)((s_unicodeDataTrie.Get(codepoint) >> LINEBREAK_SHIFT) & LINEBREAK_MASK);
// Post-change form (presumably the added side): same shift-and-mask decoding, with the trie
// obtained from the UnicodeDataTrie.Trie accessor.
return (LineBreakClass)((UnicodeDataTrie.Trie.Get(codepoint) >> LINEBREAK_SHIFT) & LINEBREAK_MASK);
}
/// <summary>
@ -115,7 +103,7 @@ namespace Avalonia.Media.TextFormatting.Unicode
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static GraphemeBreakClass GetGraphemeClusterBreak(uint codepoint)
{
// Pre-change form (presumably the removed side of this hunk): casts the value read from the
// static s_graphemeBreakTrie field directly to GraphemeBreakClass (no shift or mask).
return (GraphemeBreakClass)s_graphemeBreakTrie.Get(codepoint);
// Post-change form (presumably the added side): same direct cast, with the trie obtained from
// the GraphemeBreakTrie.Trie accessor.
return (GraphemeBreakClass)GraphemeBreakTrie.Trie.Get(codepoint);
}
}
}

3383
src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeData.trie.cs

File diff suppressed because it is too large

131
src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeTrie.cs

@ -16,98 +16,25 @@
// Copied from: https://github.com/toptensoftware/RichTextKit
using System;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
namespace Avalonia.Media.TextFormatting.Unicode
{
internal class UnicodeTrie
internal ref struct UnicodeTrie
{
private readonly uint[] _data;
private readonly int _highStart;
private readonly uint _errorValue;
/// <summary>
/// Builds a <see cref="UnicodeTrie"/> from its serialized byte form.
/// </summary>
/// <param name="rawData">
/// The serialized trie: a header at the start of the span, with the 32-bit data words
/// occupying the final <c>DataLength</c> bytes.
/// </param>
public UnicodeTrie(ReadOnlySpan<byte> rawData)
{
    UnicodeTrieHeader header = UnicodeTrieHeader.Parse(rawData);
    int payloadBytes = header.DataLength;

    // Allocate the managed copy first, then reinterpret the trailing bytes as uint words
    // and copy them into it.
    uint[] words = new uint[payloadBytes / sizeof(uint)];
    ReadOnlySpan<byte> payload = rawData.Slice(rawData.Length - payloadBytes);
    MemoryMarshal.Cast<byte, uint>(payload).CopyTo(words);

    _data = words;
    _highStart = header.HighStart;
    _errorValue = header.ErrorValue;
}
/// <summary>
/// Initializes a new instance of the <see cref="UnicodeTrie"/> class.
/// </summary>
/// <param name="stream">The stream containing the data.</param>
// NOTE(review): this rendering shows two constructor signatures over one merged body. Presumably
// the Stream overload with its BinaryReader code is the pre-change side, and the span overload
// with the three property assignments (plus the Data property below) is the post-change side.
public UnicodeTrie(Stream stream)
public UnicodeTrie(ReadOnlySpan<uint> data, int highStart, uint errorValue)
{
// Read the header info: an Int32 high start, a UInt32 error value, then an Int32 data size in
// bytes, which is divided by sizeof(uint) to size the array.
// The reader is created with leaveOpen = true, so disposing it does not close the stream.
using (var br = new BinaryReader(stream, Encoding.UTF8, true))
{
_highStart = br.ReadInt32();
_errorValue = br.ReadUInt32();
_data = new uint[br.ReadInt32() / sizeof(uint)];
}
// Read the data in compressed format.
// A second reader over the same stream reads one UInt32 per array element.
using (var br = new BinaryReader(stream, Encoding.UTF8, true))
{
for (int i = 0; i < _data.Length; i++)
{
_data[i] = br.ReadUInt32();
}
}
// Span-based overload: stores the three arguments in the get-only properties as-is.
Data = data;
HighStart = highStart;
ErrorValue = errorValue;
}
// The trie data as passed to the span-based constructor.
public ReadOnlySpan<uint> Data { get; }
/// <summary>
/// Creates a <see cref="UnicodeTrie"/> directly over an existing data array.
/// </summary>
/// <param name="data">The uncompressed trie data; the array is stored by reference, not copied.</param>
/// <param name="highStart">The start of the last range which ends at U+10ffff.</param>
/// <param name="errorValue">The value for out-of-range code points and illegal UTF-8.</param>
public UnicodeTrie(uint[] data, int highStart, uint errorValue)
{
    (_data, _highStart, _errorValue) = (data, highStart, errorValue);
}
/// <summary>
/// Saves the <see cref="UnicodeTrie"/> to the stream in a compressed format.
/// </summary>
/// <param name="stream">The output stream.</param>
// NOTE(review): the two get-only property declarations that appear inside and right after this
// method are presumably post-change lines merged into the pre-change Save method by the
// rendering; they are not part of the method body.
internal void Save(Stream stream)
{
// Write the header info
// Three values in order: _highStart, _errorValue, then the data size in bytes
// (_data.Length * sizeof(uint)). The writer is created with leaveOpen = true.
using (var bw = new BinaryWriter(stream, Encoding.UTF8, true))
{
bw.Write(_highStart);
bw.Write(_errorValue);
bw.Write(_data.Length * sizeof(uint));
}
public int HighStart { get; }
// Write the data.
// Each element of _data is written as-is, one UInt32 at a time.
using (var bw = new BinaryWriter(stream, Encoding.UTF8, true))
{
for (int i = 0; i < _data.Length; i++)
{
bw.Write(_data[i]);
}
}
}
public uint ErrorValue { get; }
/// <summary>
/// Get the value for a code point as stored in the trie.
@ -118,14 +45,14 @@ namespace Avalonia.Media.TextFormatting.Unicode
public uint Get(uint codePoint)
{
uint index;
ref uint dataBase = ref MemoryMarshal.GetReference(_data.AsSpan());
ref uint dataBase = ref MemoryMarshal.GetReference(Data);
if (codePoint is < 0x0d800 or (> 0x0dbff and <= 0x0ffff))
{
// Ordinary BMP code point, excluding leading surrogates.
// BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
// 16 bit data is stored in the index array itself.
index = _data[codePoint >> UnicodeTrieBuilder.SHIFT_2];
index = Data[(int)(codePoint >> UnicodeTrieBuilder.SHIFT_2)];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}
@ -138,55 +65,29 @@ namespace Avalonia.Media.TextFormatting.Unicode
// For this function, we need the code point data.
// Note: this expression could be refactored for slightly improved efficiency, but
// surrogate code points will be so rare in practice that it's not worth it.
index = _data[UnicodeTrieBuilder.LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UnicodeTrieBuilder.SHIFT_2)];
index = Data[(int)(UnicodeTrieBuilder.LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UnicodeTrieBuilder.SHIFT_2))];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}
if (codePoint < _highStart)
if (codePoint < HighStart)
{
// Supplemental code point, use two-level lookup.
index = UnicodeTrieBuilder.INDEX_1_OFFSET - UnicodeTrieBuilder.OMITTED_BMP_INDEX_1_LENGTH + (codePoint >> UnicodeTrieBuilder.SHIFT_1);
index = _data[index];
index = Data[(int)index];
index += (codePoint >> UnicodeTrieBuilder.SHIFT_2) & UnicodeTrieBuilder.INDEX_2_MASK;
index = _data[index];
index = Data[(int)index];
index = (index << UnicodeTrieBuilder.INDEX_SHIFT) + (codePoint & UnicodeTrieBuilder.DATA_MASK);
return Unsafe.Add(ref dataBase, (nint)index);
}
if (codePoint <= 0x10ffff)
{
return Unsafe.Add(ref dataBase, (nint)(_data.Length - UnicodeTrieBuilder.DATA_GRANULARITY));
return Data[Data.Length - UnicodeTrieBuilder.DATA_GRANULARITY];
}
// Fall through. The code point is outside of the legal range of 0..0x10ffff.
return _errorValue;
}
[StructLayout(LayoutKind.Sequential, Pack = 1)]
private struct UnicodeTrieHeader
{
public int HighStart
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}
public uint ErrorValue
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}
public int DataLength
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static UnicodeTrieHeader Parse(ReadOnlySpan<byte> data)
=> MemoryMarshal.Cast<byte, UnicodeTrieHeader>(data)[0];
return ErrorValue;
}
}
}

13
src/Avalonia.Base/Media/TextFormatting/Unicode/UnicodeTrieBuilder.cs

@ -325,19 +325,6 @@ namespace Avalonia.Media.TextFormatting.Unicode
return _data[block + (c & DATA_MASK)];
}
/// <summary>
/// Serializes the trie into a new byte array.
/// </summary>
/// <returns>Exactly the bytes written by <see cref="Save(Stream)"/>, with no trailing padding.</returns>
public byte[] ToBuffer()
{
    using var mem = new MemoryStream();
    Save(mem);

    // ToArray copies only the bytes that were written. GetBuffer exposes the whole internal
    // buffer, whose unused capacity would show up as extra zero bytes after the trie data.
    return mem.ToArray();
}
/// <summary>
/// Freezes the builder into a <see cref="UnicodeTrie"/> and saves that trie to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">The stream the frozen trie is saved to.</param>
public void Save(Stream stream) => Freeze().Save(stream);
public UnicodeTrie Freeze()
{
int allIndexesLength, i;

6
tests/Avalonia.Base.UnitTests/Media/TextFormatting/GraphemeBreakClassTrieGeneratorTests.cs

@ -22,7 +22,7 @@ namespace Avalonia.Base.UnitTests.Media.TextFormatting
_outputHelper = outputHelper;
}
[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Enumerate()
{
var generator = new GraphemeBreakTestDataGenerator();
@ -77,7 +77,7 @@ namespace Avalonia.Base.UnitTests.Media.TextFormatting
return true;
}
[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Enumerate_Other()
{
const string text = "ABCDEFGHIJ";
@ -96,7 +96,7 @@ namespace Avalonia.Base.UnitTests.Media.TextFormatting
Assert.Equal(10, count);
}
[Fact(/*Skip = "Only run when we update the trie."*/)]
[Fact(Skip = "Only run when we update the trie.")]
public void Should_Generate_Trie()
{
GraphemeBreakClassTrieGenerator.Execute();

92
tests/Avalonia.Base.UnitTests/Media/TextFormatting/UnicodeDataGenerator.cs

@ -4,7 +4,6 @@ using System.IO;
using System.Net.Http;
using System.Text.RegularExpressions;
using Avalonia.Media.TextFormatting.Unicode;
using Xunit;
namespace Avalonia.Base.UnitTests.Media.TextFormatting
{
@ -68,58 +67,57 @@ namespace Avalonia.Base.UnitTests.Media.TextFormatting
// Writes a C# source file named "{name}.trie.cs" under "Generated" that declares a static class
// "{name}Trie" holding the given trie's data.
// NOTE(review): this rendering merges the pre-change and post-change bodies of the method.
// Presumably the MemoryStream / WriteLine / ReadByte lines are the pre-change generator (byte
// array output) and the raw-string template plus the hex loop are the post-change generator
// (uint array output).
// NOTE(review): the output path hard-codes a '\\' separator; presumably the generator is only run
// on Windows - confirm.
public static void GenerateTrieClass(string name, UnicodeTrie trie)
{
// Pre-change: serialize the trie into memory so its bytes can be emitted one by one.
var stream = new MemoryStream();
trie.Save(stream);
using (var fileStream = File.Create($"Generated\\{name}.trie.cs"))
using (var writer = new StreamWriter(fileStream))
using var fileStream = File.Create($"Generated\\{name}.trie.cs");
using var writer = new StreamWriter(fileStream);
// Post-change: emit the file header, the Trie accessor (with HighStart and ErrorValue baked in
// as 8-digit hex literals) and the opening of the uint[] initializer from one interpolated
// raw string.
// NOTE(review): the template's closing "</auto-generated>" line ends with a stray double quote,
// which is written verbatim into every generated file.
writer.Write(
$$"""
//---------------------------------------------------------------------------------------------------
// <auto-generated>
// This code was generated by UnicodeDataGenerator.
// Changes to this file may cause incorrect behavior and will be lost if the code is regenerated.
// </auto-generated>"
//---------------------------------------------------------------------------------------------------
using System;
using System.Runtime.CompilerServices;
namespace Avalonia.Media.TextFormatting.Unicode;
internal static class {{name}}Trie
{
public static UnicodeTrie Trie
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => new(Data, 0x{{trie.HighStart:X8}}, 0x{{trie.ErrorValue:X8}});
}
private static ReadOnlySpan<uint> Data => new uint[]
{
""");
// Post-change loop: one iteration per uint in trie.Data.
for (int i = 0; i < trie.Data.Length; ++i)
{
// Pre-change: file preamble and class header written line by line.
writer.WriteLine("using System;");
writer.WriteLine("namespace Avalonia.Media.TextFormatting.Unicode");
writer.WriteLine("{");
writer.WriteLine($" internal static class {name}Trie");
writer.WriteLine(" {");
writer.WriteLine(" public static ReadOnlySpan<byte> Data => new byte[]");
writer.WriteLine(" {");
stream.Position = 0;
writer.Write(" ");
long length = stream.Length;
// Post-change: every element except the first is preceded by ", ".
if (i > 0)
writer.Write(", ");
// Pre-change: read the serialized bytes until ReadByte reports end of stream (-1).
while (true)
// Post-change: start a new output line before every 12th element (including the first).
if (i % 12 == 0)
{
var b = stream.ReadByte();
if(b == -1)
{
break;
}
// Pre-change: each byte is written in decimal, with a line break whenever the stream
// position is a multiple of 100.
writer.Write(b.ToString());
if (stream.Position % 100 > 0 && stream.Position != length)
{
writer.Write(", ");
}
else
{
writer.Write(',');
writer.Write(Environment.NewLine);
if (stream.Position != length)
{
writer.Write(" ");
}
}
writer.WriteLine();
writer.Write(" ");
}
writer.WriteLine(" };");
writer.WriteLine(" }");
writer.WriteLine("}");
// Post-change: each element is written as "0x" followed by 8 uppercase hex digits.
writer.Write("0x");
writer.Write(trie.Data[i].ToString("X8"));
}
// Post-change: close the array initializer and the class.
writer.Write(
"""
};
}
""");
}
public static UnicodeTrie GenerateUnicodeDataTrie(out UnicodeDataEntries dataEntries, out Dictionary<int, UnicodeDataItem> unicodeData)

Loading…
Cancel
Save