@ -4,57 +4,79 @@
// Licensed to The Avalonia Project under MIT License, courtesy of The .NET Foundation.
using System ;
using System.Runtime.InteropServices ;
namespace Avalonia.Media.TextFormatting.Unicode
{
public ref struct GraphemeEnumerator
{
private ReadOnlySpan < char > _ text ;
private readonly ReadOnlySpan < char > _ text ;
private int _ currentCodeUnitOffset ;
private int _ codeUnitLengthOfCurrentCodepoint ;
private Codepoint _ currentCodepoint ;
/// <summary>
/// Will be <see cref="GraphemeBreakClass.Other"/> if invalid data or EOF reached.
/// Caller shouldn't need to special-case this since the normal rules will halt on this condition.
/// </summary>
private GraphemeBreakClass _ currentType ;
/// <summary>
/// Creates an enumerator over <paramref name="text"/> with all decoding state reset:
/// the code unit offset and the current codepoint length start at zero, and the
/// current codepoint / break class start as the replacement codepoint and
/// <see cref="GraphemeBreakClass.Other"/> until the first codepoint is read.
/// </summary>
/// <param name="text">The span of characters to enumerate graphemes from.</param>
public GraphemeEnumerator ( ReadOnlySpan < char > text )
{
_ text = text ;
Current = default ;
_ currentCodeUnitOffset = 0 ;
_ codeUnitLengthOfCurrentCodepoint = 0 ;
_ currentCodepoint = Codepoint . ReplacementCodepoint ;
_ currentType = GraphemeBreakClass . Other ;
}
/// <summary>
/// Gets the current <see cref="Grapheme"/>.
/// </summary>
public Grapheme Current { get ; private set ; }
/// <summary>
/// Moves to the next <see cref="Grapheme"/>.
/// </summary>
/// <returns><c>true</c> if a grapheme was read; <c>false</c> if there is no more text to enumerate.</returns>
public bool MoveNext ( )
public bool MoveNext ( out Grapheme grapheme )
{
if ( _ text . IsEmpty )
var startOffset = _ currentCodeUnitOffset ;
if ( ( uint ) startOffset > = ( uint ) _ text . Length )
{
grapheme = default ;
return false ;
}
// Algorithm given at https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
var processor = new Processor ( _ text ) ;
processor . MoveNext ( ) ;
if ( startOffset = = 0 )
{
ReadNextCodepoint ( ) ;
}
var firstCodepoint = processor . C urrentCodepoint;
var firstCodepoint = _ c urrentCodepoint;
// First, consume as many Prepend scalars as we can (rule GB9b).
while ( processor . C urrentType = = GraphemeBreakClass . Prepend )
if ( _ c urrentType = = GraphemeBreakClass . Prepend )
{
processor . MoveNext ( ) ;
do
{
ReadNextCodepoint ( ) ;
} while ( _ currentType = = GraphemeBreakClass . Prepend ) ;
// There were only Prepend scalars in the text
if ( ( uint ) _ currentCodeUnitOffset > = ( uint ) _ text . Length )
{
goto Return ;
}
}
// Next, make sure we're not about to violate control character restrictions.
// Essentially, if we saw Prepend data, we can't have Control | CR | LF data afterward (rule GB5).
if ( processor . CurrentCodeUnitOffset > 0 )
if ( _ currentCodeUnitOffset > startOffset )
{
if ( processor . CurrentType = = GraphemeBreakClass . Control
| | processor . CurrentType = = GraphemeBreakClass . CR
| | processor . CurrentType = = GraphemeBreakClass . LF )
const uint controlCrLfMask =
( 1 U < < ( int ) GraphemeBreakClass . Control ) |
( 1 U < < ( int ) GraphemeBreakClass . CR ) |
( 1 U < < ( int ) GraphemeBreakClass . LF ) ;
if ( ( ( 1 U < < ( int ) _ currentType ) & controlCrLfMask ) ! = 0 U )
{
goto Return ;
}
@ -62,19 +84,19 @@ namespace Avalonia.Media.TextFormatting.Unicode
// Now begin the main state machine.
var previousClusterBreakType = processor . C urrentType;
var previousClusterBreakType = _ c urrentType;
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
switch ( previousClusterBreakType )
{
case GraphemeBreakClass . CR :
if ( processor . C urrentType ! = GraphemeBreakClass . LF )
if ( _ c urrentType ! = GraphemeBreakClass . LF )
{
goto Return ; // rules GB3 & GB4 (only <LF> can follow <CR>)
}
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
goto case GraphemeBreakClass . LF ;
case GraphemeBreakClass . Control :
@ -82,53 +104,57 @@ namespace Avalonia.Media.TextFormatting.Unicode
goto Return ; // rule GB4 (no data after Control | LF)
case GraphemeBreakClass . L :
if ( processor . CurrentType = = GraphemeBreakClass . L )
{
if ( _ currentType = = GraphemeBreakClass . L )
{
processor . MoveNex t( ) ; // rule GB6 (L x L)
ReadNextCodepoin t( ) ; // rule GB6 (L x L)
goto case GraphemeBreakClass . L ;
}
else if ( processor . C urrentType = = GraphemeBreakClass . V )
else if ( _ c urrentType = = GraphemeBreakClass . V )
{
processor . MoveNex t( ) ; // rule GB6 (L x V)
ReadNextCodepoin t( ) ; // rule GB6 (L x V)
goto case GraphemeBreakClass . V ;
}
else if ( processor . C urrentType = = GraphemeBreakClass . LV )
else if ( _ c urrentType = = GraphemeBreakClass . LV )
{
processor . MoveNex t( ) ; // rule GB6 (L x LV)
ReadNextCodepoin t( ) ; // rule GB6 (L x LV)
goto case GraphemeBreakClass . LV ;
}
else if ( processor . C urrentType = = GraphemeBreakClass . LVT )
else if ( _ c urrentType = = GraphemeBreakClass . LVT )
{
processor . MoveNex t( ) ; // rule GB6 (L x LVT)
ReadNextCodepoin t( ) ; // rule GB6 (L x LVT)
goto case GraphemeBreakClass . LVT ;
}
else
{
break ;
}
}
case GraphemeBreakClass . LV :
case GraphemeBreakClass . V :
if ( processor . CurrentType = = GraphemeBreakClass . V )
{
if ( _ currentType = = GraphemeBreakClass . V )
{
processor . MoveNex t( ) ; // rule GB7 (LV | V x V)
ReadNextCodepoin t( ) ; // rule GB7 (LV | V x V)
goto case GraphemeBreakClass . V ;
}
else if ( processor . C urrentType = = GraphemeBreakClass . T )
else if ( _ c urrentType = = GraphemeBreakClass . T )
{
processor . MoveNex t( ) ; // rule GB7 (LV | V x T)
ReadNextCodepoin t( ) ; // rule GB7 (LV | V x T)
goto case GraphemeBreakClass . T ;
}
else
{
break ;
}
}
case GraphemeBreakClass . LVT :
case GraphemeBreakClass . T :
if ( processor . C urrentType = = GraphemeBreakClass . T )
if ( _ c urrentType = = GraphemeBreakClass . T )
{
processor . MoveNex t( ) ; // rule GB8 (LVT | T x T)
ReadNextCodepoin t( ) ; // rule GB8 (LVT | T x T)
goto case GraphemeBreakClass . T ;
}
else
@ -139,123 +165,76 @@ namespace Avalonia.Media.TextFormatting.Unicode
case GraphemeBreakClass . ExtendedPictographic :
// Attempt processing extended pictographic (rules GB11, GB9).
// First, drain any Extend scalars that might exist
while ( processor . C urrentType = = GraphemeBreakClass . Extend )
while ( _ c urrentType = = GraphemeBreakClass . Extend )
{
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
}
// Now see if there's a ZWJ + extended pictograph again.
if ( processor . C urrentType ! = GraphemeBreakClass . ZWJ )
if ( _ c urrentType ! = GraphemeBreakClass . ZWJ )
{
break ;
}
processor . MoveNex t( ) ;
if ( processor . C urrentType ! = GraphemeBreakClass . ExtendedPictographic )
ReadNextCodepoin t( ) ;
if ( _ c urrentType ! = GraphemeBreakClass . ExtendedPictographic )
{
break ;
}
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
goto case GraphemeBreakClass . ExtendedPictographic ;
case GraphemeBreakClass . RegionalIndicator :
// We've consumed a single RI scalar. Try to consume another (to make it a pair).
if ( processor . C urrentType = = GraphemeBreakClass . RegionalIndicator )
if ( _ c urrentType = = GraphemeBreakClass . RegionalIndicator )
{
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
}
// Standalone RI scalars (or a single pair of RI scalars) can only be followed by trailers.
break ; // nothing but trailers after the final RI
default :
break ;
}
// rules GB9, GB9a
while ( processor . CurrentType = = GraphemeBreakClass . Extend
| | process or. CurrentType = = GraphemeBreakClass . ZWJ
| | process or. CurrentType = = GraphemeBreakClass . SpacingMark )
while ( _ currentType is GraphemeBreakClass . Extend
or GraphemeBreakClass . ZWJ
or GraphemeBreakClass . SpacingMark )
{
processor . MoveNex t( ) ;
ReadNextCodepoin t( ) ;
}
Return :
Current = new Grapheme ( firstCodepoint , _ text . Slice ( 0 , processor . CurrentCodeUnitOffset ) ) ;
_ text = _ text . Slice ( processor . CurrentCodeUnitOffset ) ;
var graphemeLength = _ currentCodeUnitOffset - startOffset ;
grapheme = new Grapheme ( firstCodepoint , startOffset , graphemeLength ) ;
return true ; // rules GB2, GB999
}
[StructLayout(LayoutKind.Auto)]
private ref struct Processor
private void ReadNextCodepoint ( )
{
private readonly ReadOnlySpan < char > _ buffer ;
private int _ codeUnitLengthOfCurrentScalar ;
internal Processor ( ReadOnlySpan < char > buffer )
{
_ buffer = buffer ;
_ codeUnitLengthOfCurrentScalar = 0 ;
CurrentCodepoint = Codepoint . ReplacementCodepoint ;
CurrentType = GraphemeBreakClass . Other ;
CurrentCodeUnitOffset = 0 ;
}
public int CurrentCodeUnitOffset { get ; private set ; }
/// <summary>
/// Will be <see cref="GraphemeBreakClass.Other"/> if invalid data or EOF reached.
/// Caller shouldn't need to special-case this since the normal rules will halt on this condition.
/// </summary>
public GraphemeBreakClass CurrentType { get ; private set ; }
/// <summary>
/// Get the currently processed <see cref="Codepoint"/>.
/// </summary>
public Codepoint CurrentCodepoint { get ; private set ; }
public void MoveNext ( )
{
// For ill-formed subsequences (like unpaired UTF-16 surrogate code points), we rely on
// the decoder's default behavior of interpreting these ill-formed subsequences as
// equivalent to U+FFFD REPLACEMENT CHARACTER. This code point has a boundary property
// of Other (XX), which matches the modifications made to UAX#29, Rev. 35.
// See: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
// This change is also reflected in the UCD files. For example, Unicode 11.0's UCD file
// https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
// has the line "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
// but starting with Unicode 12.0 that line has been removed.
//
// If a later version of the Unicode Standard further modifies this guidance we should reflect
// that here.
if ( CurrentCodeUnitOffset = = _ buffer . Length )
{
CurrentCodepoint = Codepoint . ReplacementCodepoint ;
}
else
{
CurrentCodeUnitOffset + = _ codeUnitLengthOfCurrentScalar ;
if ( CurrentCodeUnitOffset < _ buffer . Length )
{
CurrentCodepoint = Codepoint . ReadAt ( _ buffer , CurrentCodeUnitOffset ,
out _ codeUnitLengthOfCurrentScalar ) ;
}
else
{
CurrentCodepoint = Codepoint . ReplacementCodepoint ;
}
}
CurrentType = CurrentCodepoint . GraphemeBreakClass ;
}
// For ill-formed subsequences (like unpaired UTF-16 surrogate code points), we rely on
// the decoder's default behavior of interpreting these ill-formed subsequences as
// equivalent to U+FFFD REPLACEMENT CHARACTER. This code point has a boundary property
// of Other (XX), which matches the modifications made to UAX#29, Rev. 35.
// See: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
// This change is also reflected in the UCD files. For example, Unicode 11.0's UCD file
// https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
// has the line "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
// but starting with Unicode 12.0 that line has been removed.
//
// If a later version of the Unicode Standard further modifies this guidance we should reflect
// that here.
_ currentCodeUnitOffset + = _ codeUnitLengthOfCurrentCodepoint ;
_ currentCodepoint = Codepoint . ReadAt ( _ text , _ currentCodeUnitOffset ,
out _ codeUnitLengthOfCurrentCodepoint ) ;
_ currentType = _ currentCodepoint . GraphemeBreakClass ;
}
}
}