csharpc-sharpdotnetxamlavaloniauicross-platformcross-platform-xamlavaloniaguimulti-platformuser-interfacedotnetcore
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
984 lines
34 KiB
984 lines
34 KiB
// RichTextKit
|
|
// Copyright © 2019 Topten Software. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
// not use this product except in compliance with the License. You may obtain
|
|
// a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
// License for the specific language governing permissions and limitations
|
|
// under the License.
|
|
// Ported from: https://github.com/foliojs/unicode-trie
|
|
// Copied from: https://github.com/toptensoftware/RichTextKit
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
|
|
namespace Avalonia.Media.TextFormatting.Unicode
|
|
{
|
|
internal partial class UnicodeTrieBuilder
|
|
{
|
|
private readonly uint _initialValue;
|
|
private readonly uint _errorValue;
|
|
private readonly int[] _index1;
|
|
private readonly int[] _index2;
|
|
private int _highStart;
|
|
private uint[] _data;
|
|
private int _dataCapacity;
|
|
private int _firstFreeBlock;
|
|
private bool _isCompacted;
|
|
private readonly int[] _map;
|
|
private int _dataNullOffset;
|
|
private int _dataLength;
|
|
private int _index2NullOffset;
|
|
private int _index2Length;
|
|
|
|
public UnicodeTrieBuilder(uint initialValue = 0, uint errorValue = 0)
|
|
{
|
|
_initialValue = initialValue;
|
|
_errorValue = errorValue;
|
|
_index1 = new int[INDEX_1_LENGTH];
|
|
_index2 = new int[MAX_INDEX_2_LENGTH];
|
|
_highStart = 0x110000;
|
|
|
|
_data = new uint[INITIAL_DATA_LENGTH];
|
|
_dataCapacity = INITIAL_DATA_LENGTH;
|
|
|
|
_firstFreeBlock = 0;
|
|
_isCompacted = false;
|
|
|
|
// Multi-purpose per-data-block table.
|
|
//
|
|
// Before compacting:
|
|
//
|
|
// Per-data-block reference counters/free-block list.
|
|
// 0: unused
|
|
// >0: reference counter (number of index-2 entries pointing here)
|
|
// <0: next free data block in free-block list
|
|
//
|
|
// While compacting:
|
|
//
|
|
// Map of adjusted indexes, used in compactData() and compactIndex2().
|
|
// Maps from original indexes to new ones.
|
|
_map = new int[MAX_DATA_LENGTH_BUILDTIME >> SHIFT_2];
|
|
|
|
int i;
|
|
for (i = 0; i < 0x80; i++)
|
|
{
|
|
_data[i] = _initialValue;
|
|
}
|
|
|
|
for (; i < 0xc0; i++)
|
|
{
|
|
_data[i] = _errorValue;
|
|
}
|
|
|
|
for (i = DATA_NULL_OFFSET; i < NEW_DATA_START_OFFSET; i++)
|
|
{
|
|
_data[i] = _initialValue;
|
|
}
|
|
|
|
_dataNullOffset = DATA_NULL_OFFSET;
|
|
_dataLength = NEW_DATA_START_OFFSET;
|
|
|
|
// set the index-2 indexes for the 2=0x80>>SHIFT_2 ASCII data blocks
|
|
int j;
|
|
i = 0;
|
|
for (j = 0; j < 0x80; j += DATA_BLOCK_LENGTH) {
|
|
_index2[i] = j;
|
|
_map[i++] = 1;
|
|
}
|
|
|
|
// reference counts for the bad-UTF-8-data block
|
|
for (; j < 0xc0; j += DATA_BLOCK_LENGTH) {
|
|
_map[i++] = 0;
|
|
}
|
|
|
|
// Reference counts for the null data block: all blocks except for the ASCII blocks.
|
|
// Plus 1 so that we don't drop this block during compaction.
|
|
// Plus as many as needed for lead surrogate code points.
|
|
// i==newTrie->dataNullOffset
|
|
_map[i++] = ((0x110000 >> SHIFT_2) - (0x80 >> SHIFT_2)) + 1 + LSCP_INDEX_2_LENGTH;
|
|
j += DATA_BLOCK_LENGTH;
|
|
for (; j < NEW_DATA_START_OFFSET; j += DATA_BLOCK_LENGTH) {
|
|
_map[i++] = 0;
|
|
}
|
|
|
|
// set the remaining indexes in the BMP index-2 block
|
|
// to the null data block
|
|
for (i = 0x80 >> SHIFT_2; i < INDEX_2_BMP_LENGTH; i++) {
|
|
_index2[i] = DATA_NULL_OFFSET;
|
|
}
|
|
|
|
// Fill the index gap with impossible values so that compaction
|
|
// does not overlap other index-2 blocks with the gap.
|
|
for (i = 0; i < INDEX_GAP_LENGTH; i++) {
|
|
_index2[INDEX_GAP_OFFSET + i] = -1;
|
|
}
|
|
|
|
// set the indexes in the null index-2 block
|
|
for (i = 0; i < INDEX_2_BLOCK_LENGTH; i++) {
|
|
_index2[INDEX_2_NULL_OFFSET + i] = DATA_NULL_OFFSET;
|
|
}
|
|
|
|
_index2NullOffset = INDEX_2_NULL_OFFSET;
|
|
_index2Length = INDEX_2_START_OFFSET;
|
|
|
|
// set the index-1 indexes for the linear index-2 block
|
|
j = 0;
|
|
for (i = 0; i < OMITTED_BMP_INDEX_1_LENGTH; i++) {
|
|
_index1[i] = j;
|
|
j += INDEX_2_BLOCK_LENGTH;
|
|
}
|
|
|
|
// set the remaining index-1 indexes to the null index-2 block
|
|
for (; i < INDEX_1_LENGTH; i++) {
|
|
_index1[i] = INDEX_2_NULL_OFFSET;
|
|
}
|
|
|
|
// Preallocate and reset data for U+0080..U+07ff,
|
|
// for 2-byte UTF-8 which will be compacted in 64-blocks
|
|
// even if DATA_BLOCK_LENGTH is smaller.
|
|
for (i = 0x80; i < 0x800; i += DATA_BLOCK_LENGTH) {
|
|
Set(i, _initialValue);
|
|
}
|
|
|
|
}
|
|
|
|
public UnicodeTrieBuilder Set(int codePoint, uint value)
|
|
{
|
|
if ((codePoint < 0) || (codePoint > 0x10ffff))
|
|
{
|
|
throw new InvalidOperationException("Invalid code point");
|
|
}
|
|
|
|
if (_isCompacted)
|
|
{
|
|
throw new InvalidOperationException("Already compacted");
|
|
}
|
|
|
|
var block = GetDataBlock(codePoint, true);
|
|
_data[block + (codePoint & DATA_MASK)] = value;
|
|
return this;
|
|
}
|
|
|
|
public UnicodeTrieBuilder SetRange(int start, int end, uint value, bool overwrite = true)
|
|
{
|
|
|
|
if ((start > 0x10ffff) || (end > 0x10ffff) || (start > end))
|
|
{
|
|
throw new InvalidOperationException("Invalid code point");
|
|
}
|
|
|
|
if (_isCompacted)
|
|
{
|
|
throw new InvalidOperationException("Already compacted");
|
|
}
|
|
|
|
if (!overwrite && (value == _initialValue))
|
|
{
|
|
return this; // nothing to do
|
|
}
|
|
|
|
var limit = end + 1;
|
|
if ((start & DATA_MASK) != 0)
|
|
{
|
|
// set partial block at [start..following block boundary
|
|
var block = GetDataBlock(start, true);
|
|
|
|
var nextStart = (start + DATA_BLOCK_LENGTH) & ~DATA_MASK;
|
|
if (nextStart <= limit)
|
|
{
|
|
FillBlock(block, start & DATA_MASK, DATA_BLOCK_LENGTH, value, _initialValue, overwrite);
|
|
start = nextStart;
|
|
}
|
|
else
|
|
{
|
|
FillBlock(block, start & DATA_MASK, limit & DATA_MASK, value, _initialValue, overwrite);
|
|
return this;
|
|
}
|
|
}
|
|
|
|
// number of positions in the last, partial block
|
|
var rest = limit & DATA_MASK;
|
|
|
|
// round down limit to a block boundary
|
|
limit &= ~DATA_MASK;
|
|
|
|
// iterate over all-value blocks
|
|
int repeatBlock;
|
|
if (value == _initialValue)
|
|
{
|
|
repeatBlock = _dataNullOffset;
|
|
}
|
|
else
|
|
{
|
|
repeatBlock = -1;
|
|
}
|
|
|
|
while (start < limit)
|
|
{
|
|
var setRepeatBlock = false;
|
|
|
|
if ((value == _initialValue) && IsInNullBlock(start, true))
|
|
{
|
|
start += DATA_BLOCK_LENGTH; // nothing to do
|
|
continue;
|
|
}
|
|
|
|
// get index value
|
|
var i2 = GetIndex2Block(start, true);
|
|
i2 += (start >> SHIFT_2) & INDEX_2_MASK;
|
|
|
|
var block = _index2[i2];
|
|
if (IsWritableBlock(block))
|
|
{
|
|
// already allocated
|
|
if (overwrite && (block >= DATA_0800_OFFSET))
|
|
{
|
|
// We overwrite all values, and it's not a
|
|
// protected (ASCII-linear or 2-byte UTF-8) block:
|
|
// replace with the repeatBlock.
|
|
setRepeatBlock = true;
|
|
}
|
|
else
|
|
{
|
|
// protected block: just write the values into this block
|
|
FillBlock(block, 0, DATA_BLOCK_LENGTH, value, _initialValue, overwrite);
|
|
}
|
|
|
|
}
|
|
else if ((_data[block] != value) && (overwrite || (block == _dataNullOffset)))
|
|
{
|
|
// Set the repeatBlock instead of the null block or previous repeat block:
|
|
//
|
|
// If !isWritableBlock() then all entries in the block have the same value
|
|
// because it's the null block or a range block (the repeatBlock from a previous
|
|
// call to utrie2_setRange32()).
|
|
// No other blocks are used multiple times before compacting.
|
|
//
|
|
// The null block is the only non-writable block with the initialValue because
|
|
// of the repeatBlock initialization above. (If value==initialValue, then
|
|
// the repeatBlock will be the null data block.)
|
|
//
|
|
// We set our repeatBlock if the desired value differs from the block's value,
|
|
// and if we overwrite any data or if the data is all initial values
|
|
// (which is the same as the block being the null block, see above).
|
|
setRepeatBlock = true;
|
|
}
|
|
|
|
if (setRepeatBlock)
|
|
{
|
|
if (repeatBlock >= 0)
|
|
{
|
|
SetIndex2Entry(i2, repeatBlock);
|
|
}
|
|
else
|
|
{
|
|
// create and set and fill the repeatBlock
|
|
repeatBlock = GetDataBlock(start, true);
|
|
WriteBlock(repeatBlock, value);
|
|
}
|
|
}
|
|
|
|
start += DATA_BLOCK_LENGTH;
|
|
}
|
|
|
|
if (rest > 0)
|
|
{
|
|
// set partial block at [last block boundary..limit
|
|
var block = GetDataBlock(start, true);
|
|
FillBlock(block, 0, rest, value, _initialValue, overwrite);
|
|
}
|
|
|
|
return this;
|
|
}
|
|
|
|
public uint Get(int c, bool fromLSCP = true)
|
|
{
|
|
if ((c < 0) || (c > 0x10ffff))
|
|
{
|
|
return _errorValue;
|
|
}
|
|
|
|
if ((c >= _highStart) && (!((c >= 0xd800) && (c < 0xdc00)) || fromLSCP))
|
|
{
|
|
return _data[_dataLength - DATA_GRANULARITY];
|
|
}
|
|
|
|
int i2;
|
|
if (((c >= 0xd800) && (c < 0xdc00)) && fromLSCP)
|
|
{
|
|
i2 = (LSCP_INDEX_2_OFFSET - (0xd800 >> SHIFT_2)) + (c >> SHIFT_2);
|
|
}
|
|
else
|
|
{
|
|
i2 = _index1[c >> SHIFT_1] + ((c >> SHIFT_2) & INDEX_2_MASK);
|
|
}
|
|
|
|
var block = _index2[i2];
|
|
return _data[block + (c & DATA_MASK)];
|
|
}
|
|
|
|
public byte[] ToBuffer()
|
|
{
|
|
var mem = new MemoryStream();
|
|
Save(mem);
|
|
return mem.GetBuffer();
|
|
}
|
|
|
|
public void Save(Stream stream)
|
|
{
|
|
var trie = this.Freeze();
|
|
trie.Save(stream);
|
|
}
|
|
|
|
public UnicodeTrie Freeze()
|
|
{
|
|
int allIndexesLength, i;
|
|
if (!_isCompacted)
|
|
{
|
|
Compact();
|
|
}
|
|
|
|
if (_highStart <= 0x10000)
|
|
{
|
|
allIndexesLength = INDEX_1_OFFSET;
|
|
}
|
|
else
|
|
{
|
|
allIndexesLength = _index2Length;
|
|
}
|
|
|
|
var dataMove = allIndexesLength;
|
|
|
|
// are indexLength and dataLength within limits?
|
|
if ((allIndexesLength > MAX_INDEX_LENGTH) || // for unshifted indexLength
|
|
((dataMove + _dataNullOffset) > 0xffff) || // for unshifted dataNullOffset
|
|
((dataMove + DATA_0800_OFFSET) > 0xffff) || // for unshifted 2-byte UTF-8 index-2 values
|
|
((dataMove + _dataLength) > MAX_DATA_LENGTH_RUNTIME))
|
|
{ // for shiftedDataLength
|
|
throw new InvalidOperationException("Trie data is too large.");
|
|
}
|
|
|
|
// calculate the sizes of, and allocate, the index and data arrays
|
|
var indexLength = allIndexesLength + _dataLength;
|
|
var data = new uint[indexLength];
|
|
|
|
// write the index-2 array values shifted right by INDEX_SHIFT, after adding dataMove
|
|
var destIdx = 0;
|
|
for (i = 0; i < INDEX_2_BMP_LENGTH; i++)
|
|
{
|
|
data[destIdx++] = (uint)((_index2[i] + dataMove) >> INDEX_SHIFT);
|
|
}
|
|
|
|
// write UTF-8 2-byte index-2 values, not right-shifted
|
|
for (i = 0; i < 0xc2 - 0xc0; i++)
|
|
{ // C0..C1
|
|
data[destIdx++] = (uint)(dataMove + BAD_UTF8_DATA_OFFSET);
|
|
}
|
|
|
|
for (; i < 0xe0 - 0xc0; i++)
|
|
{ // C2..DF
|
|
data[destIdx++] = (uint)(dataMove + _index2[i << (6 - SHIFT_2)]);
|
|
}
|
|
|
|
if (_highStart > 0x10000)
|
|
{
|
|
var index1Length = (_highStart - 0x10000) >> SHIFT_1;
|
|
var index2Offset = INDEX_2_BMP_LENGTH + UTF8_2B_INDEX_2_LENGTH + index1Length;
|
|
|
|
// write 16-bit index-1 values for supplementary code points
|
|
for (i = 0; i < index1Length; i++)
|
|
{
|
|
data[destIdx++] = (uint)(INDEX_2_OFFSET + _index1[i + OMITTED_BMP_INDEX_1_LENGTH]);
|
|
}
|
|
|
|
// write the index-2 array values for supplementary code points,
|
|
// shifted right by INDEX_SHIFT, after adding dataMove
|
|
for (i = 0; i < _index2Length - index2Offset; i++)
|
|
{
|
|
data[destIdx++] = (uint)((dataMove + _index2[index2Offset + i]) >> INDEX_SHIFT);
|
|
}
|
|
}
|
|
|
|
// write 16-bit data values
|
|
for (i = 0; i < _dataLength; i++)
|
|
{
|
|
data[destIdx++] = _data[i];
|
|
}
|
|
|
|
return new UnicodeTrie(data, _highStart, _errorValue);
|
|
}
|
|
|
|
private bool IsInNullBlock(int c, bool forLSCP)
|
|
{
|
|
int i2;
|
|
if (((c & 0xfffffc00) == 0xd800) && forLSCP)
|
|
{
|
|
i2 = (LSCP_INDEX_2_OFFSET - (0xd800 >> SHIFT_2)) + (c >> SHIFT_2);
|
|
}
|
|
else
|
|
{
|
|
i2 = _index1[c >> SHIFT_1] + ((c >> SHIFT_2) & INDEX_2_MASK);
|
|
}
|
|
|
|
var block = _index2[i2];
|
|
return block == _dataNullOffset;
|
|
}
|
|
|
|
private int AllocIndex2Block()
|
|
{
|
|
var newBlock = _index2Length;
|
|
var newTop = newBlock + INDEX_2_BLOCK_LENGTH;
|
|
if (newTop > _index2.Length)
|
|
{
|
|
// Should never occur.
|
|
// Either MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
|
|
// or the code writes more values than should be possible.
|
|
throw new InvalidOperationException("Internal error in Trie2 creation.");
|
|
}
|
|
|
|
_index2Length = newTop;
|
|
Array.Copy(_index2, _index2NullOffset, _index2, newBlock, INDEX_2_BLOCK_LENGTH);
|
|
|
|
return newBlock;
|
|
}
|
|
|
|
private int GetIndex2Block(int c, bool forLSCP)
|
|
{
|
|
if ((c >= 0xd800) && (c < 0xdc00) && forLSCP)
|
|
{
|
|
return LSCP_INDEX_2_OFFSET;
|
|
}
|
|
|
|
var i1 = c >> SHIFT_1;
|
|
var i2 = _index1[i1];
|
|
if (i2 == _index2NullOffset)
|
|
{
|
|
i2 = AllocIndex2Block();
|
|
_index1[i1] = i2;
|
|
}
|
|
|
|
return i2;
|
|
}
|
|
|
|
private bool IsWritableBlock(int block)
|
|
{
|
|
return (block != _dataNullOffset) && (_map[block >> SHIFT_2] == 1);
|
|
}
|
|
|
|
private int AllocDataBlock(int copyBlock)
|
|
{
|
|
int newBlock;
|
|
if (_firstFreeBlock != 0)
|
|
{
|
|
// get the first free block
|
|
newBlock = _firstFreeBlock;
|
|
_firstFreeBlock = -_map[newBlock >> SHIFT_2];
|
|
}
|
|
else
|
|
{
|
|
// get a new block from the high end
|
|
newBlock = _dataLength;
|
|
var newTop = newBlock + DATA_BLOCK_LENGTH;
|
|
if (newTop > _dataCapacity)
|
|
{
|
|
// out of memory in the data array
|
|
int capacity;
|
|
if (_dataCapacity < MEDIUM_DATA_LENGTH)
|
|
{
|
|
capacity = MEDIUM_DATA_LENGTH;
|
|
}
|
|
else if (_dataCapacity < MAX_DATA_LENGTH_BUILDTIME)
|
|
{
|
|
capacity = MAX_DATA_LENGTH_BUILDTIME;
|
|
}
|
|
else
|
|
{
|
|
// Should never occur.
|
|
// Either MAX_DATA_LENGTH_BUILDTIME is incorrect,
|
|
// or the code writes more values than should be possible.
|
|
throw new InvalidOperationException("Internal error in Trie2 creation.");
|
|
}
|
|
|
|
var newData = new UInt32[capacity];
|
|
Array.Copy(_data, newData, _dataLength);
|
|
_data = newData;
|
|
_dataCapacity = capacity;
|
|
}
|
|
|
|
_dataLength = newTop;
|
|
}
|
|
|
|
Array.Copy(_data, copyBlock, _data, newBlock, DATA_BLOCK_LENGTH);
|
|
//_data.set(_data.subarray(copyBlock, copyBlock + DATA_BLOCK_LENGTH), newBlock);
|
|
_map[newBlock >> SHIFT_2] = 0;
|
|
return newBlock;
|
|
}
|
|
|
|
private void ReleaseDataBlock(int block)
|
|
{
|
|
// put this block at the front of the free-block chain
|
|
_map[block >> SHIFT_2] = -_firstFreeBlock;
|
|
_firstFreeBlock = block;
|
|
}
|
|
|
|
private void SetIndex2Entry(int i2, int block)
|
|
{
|
|
++_map[block >> SHIFT_2]; // increment first, in case block == oldBlock!
|
|
var oldBlock = _index2[i2];
|
|
if (--_map[oldBlock >> SHIFT_2] == 0)
|
|
{
|
|
ReleaseDataBlock(oldBlock);
|
|
}
|
|
|
|
_index2[i2] = block;
|
|
}
|
|
|
|
private int GetDataBlock(int c, bool forLSCP)
|
|
{
|
|
var i2 = GetIndex2Block(c, forLSCP);
|
|
i2 += (c >> SHIFT_2) & INDEX_2_MASK;
|
|
|
|
var oldBlock = _index2[i2];
|
|
if (IsWritableBlock(oldBlock))
|
|
{
|
|
return oldBlock;
|
|
}
|
|
|
|
// allocate a new data block
|
|
var newBlock = AllocDataBlock(oldBlock);
|
|
SetIndex2Entry(i2, newBlock);
|
|
return newBlock;
|
|
}
|
|
|
|
private void FillBlock(int block, int start, int limit, uint value, uint initialValue, bool overwrite)
|
|
{
|
|
int i;
|
|
if (overwrite)
|
|
{
|
|
for (i = block + start; i < block + limit; i++)
|
|
{
|
|
_data[i] = value;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (i = block + start; i < block + limit; i++)
|
|
{
|
|
if (_data[i] == initialValue)
|
|
{
|
|
_data[i] = value;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private void WriteBlock(int block, uint value)
|
|
{
|
|
var limit = block + DATA_BLOCK_LENGTH;
|
|
while (block < limit)
|
|
{
|
|
_data[block++] = value;
|
|
}
|
|
}
|
|
|
|
private int FindHighStart(uint highValue)
|
|
{
|
|
int prevBlock, prevI2Block;
|
|
|
|
// set variables for previous range
|
|
if (highValue == _initialValue)
|
|
{
|
|
prevI2Block = _index2NullOffset;
|
|
prevBlock = _dataNullOffset;
|
|
}
|
|
else
|
|
{
|
|
prevI2Block = -1;
|
|
prevBlock = -1;
|
|
}
|
|
|
|
int prev = 0x110000;
|
|
|
|
// enumerate index-2 blocks
|
|
var i1 = INDEX_1_LENGTH;
|
|
var c = prev;
|
|
while (c > 0)
|
|
{
|
|
var i2Block = _index1[--i1];
|
|
if (i2Block == prevI2Block)
|
|
{
|
|
// the index-2 block is the same as the previous one, and filled with highValue
|
|
c -= CP_PER_INDEX_1_ENTRY;
|
|
continue;
|
|
}
|
|
|
|
prevI2Block = i2Block;
|
|
if (i2Block == _index2NullOffset)
|
|
{
|
|
// this is the null index-2 block
|
|
if (highValue != _initialValue)
|
|
{
|
|
return c;
|
|
}
|
|
c -= CP_PER_INDEX_1_ENTRY;
|
|
}
|
|
else
|
|
{
|
|
// enumerate data blocks for one index-2 block
|
|
var i2 = INDEX_2_BLOCK_LENGTH;
|
|
while (i2 > 0)
|
|
{
|
|
var block = _index2[i2Block + --i2];
|
|
if (block == prevBlock)
|
|
{
|
|
// the block is the same as the previous one, and filled with highValue
|
|
c -= DATA_BLOCK_LENGTH;
|
|
continue;
|
|
}
|
|
|
|
prevBlock = block;
|
|
if (block == _dataNullOffset)
|
|
{
|
|
// this is the null data block
|
|
if (highValue != _initialValue)
|
|
{
|
|
return c;
|
|
}
|
|
c -= DATA_BLOCK_LENGTH;
|
|
}
|
|
else
|
|
{
|
|
var j = DATA_BLOCK_LENGTH;
|
|
while (j > 0)
|
|
{
|
|
var value = _data[block + --j];
|
|
if (value != highValue)
|
|
{
|
|
return c;
|
|
}
|
|
--c;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// deliver last range
|
|
return 0;
|
|
}
|
|
|
|
private int FindSameDataBlock(int dataLength, int otherBlock, int blockLength)
|
|
{
|
|
// ensure that we do not even partially get past dataLength
|
|
dataLength -= blockLength;
|
|
var block = 0;
|
|
while (block <= dataLength)
|
|
{
|
|
if (EqualSequence(_data, block, otherBlock, blockLength))
|
|
{
|
|
return block;
|
|
}
|
|
block += DATA_GRANULARITY;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
private int FindSameIndex2Block(int index2Length, int otherBlock) {
|
|
// ensure that we do not even partially get past index2Length
|
|
index2Length -= INDEX_2_BLOCK_LENGTH;
|
|
for (var block = 0; block <= index2Length; block++)
|
|
{
|
|
if (EqualSequence(_index2, block, otherBlock, INDEX_2_BLOCK_LENGTH))
|
|
{
|
|
return block;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
private void CompactData()
|
|
{
|
|
// do not compact linear-ASCII data
|
|
var newStart = DATA_START_OFFSET;
|
|
var start = 0;
|
|
var i = 0;
|
|
|
|
while (start < newStart)
|
|
{
|
|
_map[i++] = start;
|
|
start += DATA_BLOCK_LENGTH;
|
|
}
|
|
|
|
// Start with a block length of 64 for 2-byte UTF-8,
|
|
// then switch to DATA_BLOCK_LENGTH.
|
|
var blockLength = 64;
|
|
var blockCount = blockLength >> SHIFT_2;
|
|
start = newStart;
|
|
while (start < _dataLength)
|
|
{
|
|
// start: index of first entry of current block
|
|
// newStart: index where the current block is to be moved
|
|
// (right after current end of already-compacted data)
|
|
int mapIndex, movedStart;
|
|
if (start == DATA_0800_OFFSET)
|
|
{
|
|
blockLength = DATA_BLOCK_LENGTH;
|
|
blockCount = 1;
|
|
}
|
|
|
|
// skip blocks that are not used
|
|
if (_map[start >> SHIFT_2] <= 0)
|
|
{
|
|
// advance start to the next block
|
|
start += blockLength;
|
|
|
|
// leave newStart with the previous block!
|
|
continue;
|
|
}
|
|
|
|
// search for an identical block
|
|
if ((movedStart = FindSameDataBlock(newStart, start, blockLength)) >= 0)
|
|
{
|
|
// found an identical block, set the other block's index value for the current block
|
|
mapIndex = start >> SHIFT_2;
|
|
for (i = blockCount; i > 0; i--)
|
|
{
|
|
_map[mapIndex++] = movedStart;
|
|
movedStart += DATA_BLOCK_LENGTH;
|
|
}
|
|
|
|
// advance start to the next block
|
|
start += blockLength;
|
|
|
|
// leave newStart with the previous block!
|
|
continue;
|
|
}
|
|
|
|
// see if the beginning of this block can be overlapped with the end of the previous block
|
|
// look for maximum overlap (modulo granularity) with the previous, adjacent block
|
|
var overlap = blockLength - DATA_GRANULARITY;
|
|
while ((overlap > 0) && !EqualSequence(_data, (newStart - overlap), start, overlap))
|
|
{
|
|
overlap -= DATA_GRANULARITY;
|
|
}
|
|
|
|
if ((overlap > 0) || (newStart < start))
|
|
{
|
|
// some overlap, or just move the whole block
|
|
movedStart = newStart - overlap;
|
|
mapIndex = start >> SHIFT_2;
|
|
|
|
for (i = blockCount; i > 0; i--)
|
|
{
|
|
_map[mapIndex++] = movedStart;
|
|
movedStart += DATA_BLOCK_LENGTH;
|
|
}
|
|
|
|
// move the non-overlapping indexes to their new positions
|
|
start += overlap;
|
|
for (i = blockLength - overlap; i > 0; i--)
|
|
{
|
|
_data[newStart++] = _data[start++];
|
|
}
|
|
|
|
}
|
|
else
|
|
{ // no overlap && newStart==start
|
|
mapIndex = start >> SHIFT_2;
|
|
for (i = blockCount; i > 0; i--)
|
|
{
|
|
_map[mapIndex++] = start;
|
|
start += DATA_BLOCK_LENGTH;
|
|
}
|
|
|
|
newStart = start;
|
|
}
|
|
}
|
|
|
|
// now adjust the index-2 table
|
|
i = 0;
|
|
while (i < _index2Length)
|
|
{
|
|
// Gap indexes are invalid (-1). Skip over the gap.
|
|
if (i == INDEX_GAP_OFFSET)
|
|
{
|
|
i += INDEX_GAP_LENGTH;
|
|
}
|
|
_index2[i] = _map[_index2[i] >> SHIFT_2];
|
|
++i;
|
|
}
|
|
|
|
_dataNullOffset = _map[_dataNullOffset >> SHIFT_2];
|
|
|
|
// ensure dataLength alignment
|
|
while ((newStart & (DATA_GRANULARITY - 1)) != 0)
|
|
{
|
|
_data[newStart++] = _initialValue;
|
|
}
|
|
_dataLength = newStart;
|
|
}
|
|
|
|
private void CompactIndex2()
|
|
{
|
|
// do not compact linear-BMP index-2 blocks
|
|
var newStart = INDEX_2_BMP_LENGTH;
|
|
var start = 0;
|
|
var i = 0;
|
|
|
|
while (start < newStart)
|
|
{
|
|
_map[i++] = start;
|
|
start += INDEX_2_BLOCK_LENGTH;
|
|
}
|
|
|
|
// Reduce the index table gap to what will be needed at runtime.
|
|
newStart += UTF8_2B_INDEX_2_LENGTH + ((_highStart - 0x10000) >> SHIFT_1);
|
|
start = INDEX_2_NULL_OFFSET;
|
|
while (start < _index2Length)
|
|
{
|
|
// start: index of first entry of current block
|
|
// newStart: index where the current block is to be moved
|
|
// (right after current end of already-compacted data)
|
|
|
|
// search for an identical block
|
|
int movedStart;
|
|
if ((movedStart = FindSameIndex2Block(newStart, start)) >= 0)
|
|
{
|
|
// found an identical block, set the other block's index value for the current block
|
|
_map[start >> SHIFT_1_2] = movedStart;
|
|
|
|
// advance start to the next block
|
|
start += INDEX_2_BLOCK_LENGTH;
|
|
|
|
// leave newStart with the previous block!
|
|
continue;
|
|
}
|
|
|
|
// see if the beginning of this block can be overlapped with the end of the previous block
|
|
// look for maximum overlap with the previous, adjacent block
|
|
var overlap = INDEX_2_BLOCK_LENGTH - 1;
|
|
while ((overlap > 0) && !EqualSequence(_index2, (newStart - overlap), start, overlap))
|
|
{
|
|
--overlap;
|
|
}
|
|
|
|
if ((overlap > 0) || (newStart < start))
|
|
{
|
|
// some overlap, or just move the whole block
|
|
_map[start >> SHIFT_1_2] = newStart - overlap;
|
|
|
|
// move the non-overlapping indexes to their new positions
|
|
start += overlap;
|
|
for (i = INDEX_2_BLOCK_LENGTH - overlap; i > 0; i--)
|
|
{
|
|
_index2[newStart++] = _index2[start++];
|
|
}
|
|
|
|
}
|
|
else
|
|
{ // no overlap && newStart==start
|
|
_map[start >> SHIFT_1_2] = start;
|
|
start += INDEX_2_BLOCK_LENGTH;
|
|
newStart = start;
|
|
}
|
|
}
|
|
|
|
// now adjust the index-1 table
|
|
for (i = 0; i < INDEX_1_LENGTH; i++)
|
|
{
|
|
_index1[i] = _map[_index1[i] >> SHIFT_1_2];
|
|
}
|
|
|
|
_index2NullOffset = _map[_index2NullOffset >> SHIFT_1_2];
|
|
|
|
// Ensure data table alignment:
|
|
// Needs to be granularity-aligned for 16-bit trie
|
|
// (so that dataMove will be down-shiftable),
|
|
// and 2-aligned for uint32_t data.
|
|
|
|
// Arbitrary value: 0x3fffc not possible for real data.
|
|
while ((newStart & ((DATA_GRANULARITY - 1) | 1)) != 0)
|
|
{
|
|
_index2[newStart++] = 0x0000ffff << INDEX_SHIFT;
|
|
}
|
|
|
|
_index2Length = newStart;
|
|
}
|
|
|
|
private void Compact()
|
|
{
|
|
// find highStart and round it up
|
|
var highValue = Get(0x10ffff);
|
|
var highStart = FindHighStart(highValue);
|
|
highStart = (highStart + (CP_PER_INDEX_1_ENTRY - 1)) & ~(CP_PER_INDEX_1_ENTRY - 1);
|
|
if (highStart == 0x110000)
|
|
{
|
|
highValue = _errorValue;
|
|
}
|
|
|
|
// Set trie->highStart only after utrie2_get32(trie, highStart).
|
|
// Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
|
|
_highStart = highStart;
|
|
if (_highStart < 0x110000)
|
|
{
|
|
// Blank out [highStart..10ffff] to release associated data blocks.
|
|
var suppHighStart = _highStart <= 0x10000 ? 0x10000 : _highStart;
|
|
SetRange(suppHighStart, 0x10ffff, _initialValue);
|
|
}
|
|
|
|
CompactData();
|
|
|
|
if (_highStart > 0x10000)
|
|
{
|
|
CompactIndex2();
|
|
}
|
|
|
|
// Store the highValue in the data array and round up the dataLength.
|
|
// Must be done after compactData() because that assumes that dataLength
|
|
// is a multiple of DATA_BLOCK_LENGTH.
|
|
_data[_dataLength++] = highValue;
|
|
while ((_dataLength & (DATA_GRANULARITY - 1)) != 0)
|
|
{
|
|
_data[_dataLength++] = _initialValue;
|
|
}
|
|
|
|
_isCompacted = true;
|
|
}
|
|
|
|
private static bool EqualSequence(IReadOnlyList<uint> a, int s, int t, int length)
|
|
{
|
|
for (var i = 0; i < length; i++)
|
|
{
|
|
if (a[s + i] != a[t + i])
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
private static bool EqualSequence(IReadOnlyList<int> a, int s, int t, int length)
|
|
{
|
|
for (var i = 0; i < length; i++)
|
|
{
|
|
if (a[s + i] != a[t + i])
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|