Skip to content

Commit

Permalink
UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Dec 4, 2024
1 parent d597a5f commit 7581aa7
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 41 deletions.
51 changes: 32 additions & 19 deletions src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs
public static class BlockTreeTermsWriter
{
/// <summary>
/// Suggested default value for the
/// <c>minItemsInBlock</c> parameter to
/// Suggested default value for the
/// <c>minItemsInBlock</c> parameter to
/// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
/// </summary>
public const int DEFAULT_MIN_BLOCK_SIZE = 25;

/// <summary>
/// Suggested default value for the
/// <c>maxItemsInBlock</c> parameter to
/// Suggested default value for the
/// <c>maxItemsInBlock</c> parameter to
/// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
/// </summary>
public const int DEFAULT_MAX_BLOCK_SIZE = 48;
Expand Down Expand Up @@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long
/// to set state. It is *optional* and can be used when overriding the WriteHeader(),
/// WriteIndexHeader(). It only matters in the case where the state
/// is required inside of any of those methods that is passed in to the subclass constructor.
///
///
/// When passed to the constructor, it is set to the protected field m_subclassState before
/// any of the above methods are called where it is available for reading when overriding the above methods.
///
///
/// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so.
/// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
/// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
/// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
/// </param>
[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
Expand Down Expand Up @@ -468,20 +468,32 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f

public override string ToString()
{
return "BLOCK: " + Prefix.Utf8ToString();
return $"BLOCK: {Prefix.Utf8ToString()}";
}

#nullable enable
/// <summary>
/// Attempts to produce the same text as <see cref="ToString()"/>, reporting failure
/// through the return value when <c>Prefix.TryUtf8ToString</c> cannot decode the prefix.
/// </summary>
/// <param name="result">
/// When this method returns <c>true</c>, the text <c>"BLOCK: "</c> followed by the decoded prefix;
/// otherwise, <c>null</c>.
/// </param>
/// <returns><c>true</c> if the prefix was decoded; otherwise, <c>false</c>.</returns>
public bool TryToString([NotNullWhen(true)] out string? result)
{
    // Bail out first so the success path reads straight through.
    if (!Prefix.TryUtf8ToString(out string? decodedPrefix))
    {
        result = null;
        return false;
    }

    result = $"BLOCK: {decodedPrefix}";
    return true;
}

// LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
// because of invalid UTF8 code in Prefix, we have a wrapper struct that falls back
// to using PendingBlock.Prefix.ToString() if PendingBlock.TryToString() fails.
// This struct defers formatting the string until it is actually used as a parameter
// in string.Format().
private struct PendingBlocksFormatter // For assert
private readonly struct PendingBlocksFormatter // For assert
{
#pragma warning disable IDE0044 // Add readonly modifier
private IList<PendingBlock> blocks;
#pragma warning restore IDE0044 // Add readonly modifier
public PendingBlocksFormatter(IList<PendingBlock> blocks)
private readonly IList<PendingBlock>? blocks;

public PendingBlocksFormatter(IList<PendingBlock>? blocks)
{
this.blocks = blocks; // May be null
}
Expand All @@ -500,17 +512,17 @@ public override string ToString() // For assert
it.MoveNext();
while (true)
{
var e = it.Current;
var e = it.Current ?? throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
// There is a chance that the Prefix will contain invalid UTF8,
// so we catch that and use the alternative way of displaying it
try
if (e.TryToString(out string? eString))
{
sb.Append(e.ToString());
sb.Append(eString);
}
catch (IndexOutOfRangeException)
else
{
sb.Append("BLOCK: ");
sb.Append(e.Prefix.ToString());
sb.Append(e.Prefix);
}
if (!it.MoveNext())
{
Expand All @@ -520,6 +532,7 @@ public override string ToString() // For assert
}
}
}
#nullable restore

public void CompileIndex(IList<PendingBlock> floorBlocks, RAMOutputStream scratchBytes)
{
Expand Down Expand Up @@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing)
}
}
}
}
}
20 changes: 20 additions & 0 deletions src/Lucene.Net/Util/BytesRef.cs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,26 @@ public string Utf8ToString()
return @ref.ToString();
}

#nullable enable
/// <summary>
/// Attempts to decode the stored bytes as UTF-8 and hand back the decoded
/// <see cref="string"/> through <paramref name="result"/>. The decoding is delegated to
/// <c>UnicodeUtil.TryUTF8toUTF16</c> over this instance's bytes, <c>Offset</c> and <c>Length</c>.
/// </summary>
/// <param name="result">
/// When this method returns <c>true</c>, the decoded string; otherwise, <c>null</c>.
/// </param>
/// <returns><c>true</c> if the bytes were decoded; otherwise, <c>false</c>.</returns>
public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
{
    // Handle the failure case up front; the decoded chars are only valid on success.
    if (!UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? decoded))
    {
        result = null;
        return false;
    }

    result = decoded.ToString();
    return true;
}
#nullable restore

/// <summary>
/// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] </summary>
public override string ToString()
Expand Down
122 changes: 100 additions & 22 deletions src/Lucene.Net/Util/UnicodeUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Text;

Expand Down Expand Up @@ -123,13 +124,13 @@ public static class UnicodeUtil
private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;

/// <summary>
/// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
/// Encode all characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>
/// as UTF-8 into <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
// TODO: broken if incoming result.offset != 0
// LUCENENET specific overload
public static void UTF16toUTF8(Span<char> source, BytesRef result)
public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
{
// LUCENENET: Added guard clause
if (result is null)
Expand Down Expand Up @@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
}

/// <summary>
/// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
/// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
/// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
Expand All @@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
/// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
/// </exception>
// TODO: broken if incoming result.offset != 0
public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
{
// LUCENENET: Added guard clauses
if (source is null)
throw new ArgumentNullException(nameof(source));
if (result is null)
throw new ArgumentNullException(nameof(result));
if (offset < 0)
Expand Down Expand Up @@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
return true;
}

public static bool ValidUTF16String(char[] s, int size)
public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
{
for (int i = 0; i < size; i++)
{
Expand Down Expand Up @@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

/// <summary>
/// Cover JDK 1.5 API. Create a String from an array of <paramref name="codePoints"/>.
/// Cover JDK 1.5 API. Create a String from a span of <paramref name="codePoints"/>.
/// </summary>
/// <param name="codePoints"> The code array. </param>
/// <param name="offset"> The start of the text in the code point array. </param>
/// <param name="codePoints"> The code point span. </param>
/// <param name="offset"> The start of the text in the code point span. </param>
/// <param name="count"> The number of code points. </param>
/// <returns> a String representing the code points between offset and count. </returns>
/// <exception cref="ArgumentException"> If an invalid code point is encountered. </exception>
/// <exception cref="IndexOutOfRangeException"> If the offset or count are out of bounds. </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static string NewString(int[] codePoints, int offset, int count)
public static string NewString(ReadOnlySpan<int> codePoints, int offset, int count)
{
// LUCENENET: Character.ToString() was optimized to use the stack for arrays
// of codepoints 256 or less, so it performs better than using ToCharArray().
Expand All @@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count)
/// <para/>
/// LUCENENET specific.
/// </summary>
/// <param name="codePoints"> The code array. </param>
/// <param name="offset"> The start of the text in the code point array. </param>
/// <param name="codePoints"> The code span. </param>
/// <param name="offset"> The start of the text in the code point span. </param>
/// <param name="count"> The number of code points. </param>
/// <returns> a char array representing the code points between offset and count. </returns>
// LUCENENET NOTE: This code was originally in the NewString() method (above).
// It has been refactored from the original to remove the exception throw/catch and
// instead proactively resizes the array instead of relying on excpetions + copy operations
public static char[] ToCharArray(int[] codePoints, int offset, int count)
// instead proactively resizes the array instead of relying on exceptions + copy operations
public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int count)
{
if (count < 0)
{
throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
}
const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
// LUCENENET: as a first approximation, assume each codepoint
// is 2 characters (since it cannot be longer than this)
int arrayLength = count * 2;
// LUCENENET: if we go over the threashold, count the number of
// LUCENENET: if we go over the threshold, count the number of
// chars we will need so we can allocate the precise amount of memory
if (count > countThreashold)
if (count > countThreshold)
{
arrayLength = 0;
for (int r = offset, e = offset + count; r < e; ++r)
Expand Down Expand Up @@ -951,15 +950,18 @@ public static string ToHexString(string s)
}

/// <summary>
/// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
/// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
/// <para/>
/// NOTE: Full characters are read, even if this reads past the length passed (and
/// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <remarks>
/// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
/// </remarks>
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
{
int out_offset = chars.Offset = 0;
char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
Expand Down Expand Up @@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
chars.Length = out_offset - chars.Offset;
}

#nullable enable
/// <summary>
/// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
/// <para/>
/// NOTE: Full characters are read, even if this reads past <paramref name="offset"/> + <paramref name="length"/>.
/// If a character's bytes would run past the end of <paramref name="utf8"/>, this method returns
/// <c>false</c> instead of throwing. Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <remarks>
/// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
/// </remarks>
/// <param name="utf8">The bytes to decode.</param>
/// <param name="offset">The index in <paramref name="utf8"/> of the first byte to decode.</param>
/// <param name="length">The number of bytes to decode, starting at <paramref name="offset"/>.</param>
/// <param name="chars">
/// When this method returns <c>true</c>, a new <see cref="CharsRef"/> holding the decoded UTF-16 chars;
/// otherwise, <c>null</c>.
/// </param>
/// <returns><c>true</c> if every character could be read from <paramref name="utf8"/>; otherwise, <c>false</c>.</returns>
public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
{
    CharsRef result = new CharsRef(length);
    int out_offset = 0;
    char[] @out = result.Chars;
    int limit = offset + length;
    while (offset < limit)
    {
        if (utf8.Length <= offset)
        {
            chars = null;
            return false;
        }

        int b = utf8[offset++] & 0xff;
        if (b < 0xc0)
        {
            // Single-byte (ASCII) character
            if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
            @out[out_offset++] = (char)b;
        }
        else if (b < 0xe0)
        {
            // Two-byte sequence: one continuation byte must be available
            if (utf8.Length <= offset)
            {
                chars = null;
                return false;
            }
            @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
        }
        else if (b < 0xf0)
        {
            // Three-byte sequence: two continuation bytes must be available
            if (utf8.Length <= offset + 1)
            {
                chars = null;
                return false;
            }
            @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
            offset += 2;
        }
        else
        {
            // Four-byte sequence: three continuation bytes must be available
            if (utf8.Length <= offset + 2)
            {
                chars = null;
                return false;
            }
            if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
            int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
            offset += 3;
            if (ch < UNI_MAX_BMP)
            {
                @out[out_offset++] = (char)ch;
            }
            else
            {
                // A supplementary code point takes two UTF-16 chars. The buffer was sized for
                // `length` chars, which is one short when the lead byte is the last byte inside
                // the requested range and the rest of the sequence is read past it. Grow the
                // buffer so this Try method does not throw on the second write.
                if (@out.Length - out_offset < 2)
                {
                    Array.Resize(ref @out, out_offset + 2);
                    result.Chars = @out;
                }

                int chHalf = ch - 0x0010000;
                @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
                @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
            }
        }
    }
    result.Length = out_offset;
    chars = result;
    return true;
}
#nullable restore

/// <summary>
/// Utility method for <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/> </summary>
/// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>
/// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
Expand Down

0 comments on commit 7581aa7

Please sign in to comment.