UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024

apache · Dec 4, 2024 · 7581aa7 · 7581aa7
1 parent d597a5f
commit 7581aa7
Show file tree

Hide file tree

Showing 3 changed files with 152 additions and 41 deletions.
diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
@@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs
     public static class BlockTreeTermsWriter
     {
         /// <summary>
-        /// Suggested default value for the 
-        /// <c>minItemsInBlock</c> parameter to 
+        /// Suggested default value for the
+        /// <c>minItemsInBlock</c> parameter to
         /// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
         /// </summary>
         public const int DEFAULT_MIN_BLOCK_SIZE = 25;
 
         /// <summary>
-        /// Suggested default value for the 
-        /// <c>maxItemsInBlock</c> parameter to 
+        /// Suggested default value for the
+        /// <c>maxItemsInBlock</c> parameter to
         /// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
         /// </summary>
         public const int DEFAULT_MAX_BLOCK_SIZE = 48;
@@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long
         /// to set state. It is *optional* and can be used when overriding the WriteHeader(),
         /// WriteIndexHeader(). It only matters in the case where the state
         /// is required inside of any of those methods that is passed in to the subclass constructor.
-        /// 
+        ///
         /// When passed to the constructor, it is set to the protected field m_subclassState before
         /// any of the above methods are called where it is available for reading when overriding the above methods.
-        /// 
+        ///
         /// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so.
-        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, 
+        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
         /// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
         /// </param>
         [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
@@ -468,20 +468,32 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
 
             public override string ToString()
             {
-                return "BLOCK: " + Prefix.Utf8ToString();
+                return $"BLOCK: {Prefix.Utf8ToString()}";
+            }
+
+            #nullable enable
+            public bool TryToString([NotNullWhen(true)] out string? result)
+            {
+                if (Prefix.TryUtf8ToString(out string? prefixString))
+                {
+                    result = $"BLOCK: {prefixString}";
+                    return true;
+                }
+
+                result = null;
+                return false;
             }
 
             // LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
             // because of invalid UTF8 code in Prefix, we have a wrapper class that falls back
             // to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors.
             // This struct defers formatting the string until it is actually used as a parameter
             // in string.Format().
-            private struct PendingBlocksFormatter // For assert
+            private readonly struct PendingBlocksFormatter // For assert
             {
-#pragma warning disable IDE0044 // Add readonly modifier
-                private IList<PendingBlock> blocks;
-#pragma warning restore IDE0044 // Add readonly modifier
-                public PendingBlocksFormatter(IList<PendingBlock> blocks)
+                private readonly IList<PendingBlock>? blocks;
+
+                public PendingBlocksFormatter(IList<PendingBlock>? blocks)
                 {
                     this.blocks = blocks; // May be null
                 }
@@ -500,17 +512,17 @@ public override string ToString() // For assert
                     it.MoveNext();
                     while (true)
                     {
-                        var e = it.Current;
+                        var e = it.Current ?? throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
                         // There is a chance that the Prefix will contain invalid UTF8,
                         // so we catch that and use the alternative way of displaying it
-                        try
+                        if (e.TryToString(out string? eString))
                         {
-                            sb.Append(e.ToString());
+                            sb.Append(eString);
                         }
-                        catch (IndexOutOfRangeException)
+                        else
                         {
                             sb.Append("BLOCK: ");
-                            sb.Append(e.Prefix.ToString());
+                            sb.Append(e.Prefix);
                         }
                         if (!it.MoveNext())
                         {
@@ -520,6 +532,7 @@ public override string ToString() // For assert
                     }
                 }
             }
+            #nullable restore
 
             public void CompileIndex(IList<PendingBlock> floorBlocks, RAMOutputStream scratchBytes)
             {
@@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing)
             }
         }
     }
-}
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,26 @@ public string Utf8ToString()
             return @ref.ToString();
         }
 
+        #nullable enable
+        /// <summary>
+        /// Tries to interpret the stored bytes as UTF8 bytes, returning the
+        /// resulting <see cref="string"/> as an output parameter <paramref name="result"/>.
+        /// </summary>
+        /// <param name="result">The resulting string output.</param>
+        /// <returns><c>true</c> if successful, <c>false</c> otherwise.</returns>
+        public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
+        {
+            if (UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? @ref))
+            {
+                result = @ref.ToString();
+                return true;
+            }
+
+            result = null;
+            return false;
+        }
+        #nullable restore
+
         /// <summary>
         /// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] </summary>
         public override string ToString()

diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -3,6 +3,7 @@
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Support;
 using System;
+using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Text;
 
@@ -123,13 +124,13 @@ public static class UnicodeUtil
         private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
         /// <summary>
-        /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
         /// and ending at <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
         // TODO: broken if incoming result.offset != 0
         // LUCENENET specific overload
-        public static void UTF16toUTF8(Span<char> source, BytesRef result)
+        public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
         {
             // LUCENENET: Added guard clause
             if (result is null)
@@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
         }
 
         /// <summary>
-        /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
         /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
@@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
         /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
         /// </exception>
         // TODO: broken if incoming result.offset != 0
-        public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
+        public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
         {
             // LUCENENET: Added guard clauses
-            if (source is null)
-                throw new ArgumentNullException(nameof(source));
             if (result is null)
                 throw new ArgumentNullException(nameof(result));
             if (offset < 0)
@@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
             return true;
         }
 
-        public static bool ValidUTF16String(char[] s, int size)
+        public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
         {
             for (int i = 0; i < size; i++)
             {
@@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
         private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
 
         /// <summary>
-        /// Cover JDK 1.5 API. Create a String from an array of <paramref name="codePoints"/>.
+        /// Cover JDK 1.5 API. Create a String from a span of <paramref name="codePoints"/>.
         /// </summary>
-        /// <param name="codePoints"> The code array. </param>
-        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="codePoints"> The code point span. </param>
+        /// <param name="offset"> The start of the text in the code point span. </param>
         /// <param name="count"> The number of code points. </param>
         /// <returns> a String representing the code points between offset and count. </returns>
         /// <exception cref="ArgumentException"> If an invalid code point is encountered. </exception>
         /// <exception cref="IndexOutOfRangeException"> If the offset or count are out of bounds. </exception>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static string NewString(int[] codePoints, int offset, int count)
+        public static string NewString(ReadOnlySpan<int> codePoints, int offset, int count)
         {
             // LUCENENET: Character.ToString() was optimized to use the stack for arrays
             // of codepoints 256 or less, so it performs better than using ToCharArray().
@@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count)
         /// <para/>
         /// LUCENENET specific.
         /// </summary>
-        /// <param name="codePoints"> The code array. </param>
-        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="codePoints"> The code span. </param>
+        /// <param name="offset"> The start of the text in the code point span. </param>
         /// <param name="count"> The number of code points. </param>
         /// <returns> a char array representing the code points between offset and count. </returns>
         // LUCENENET NOTE: This code was originally in the NewString() method (above).
         // It has been refactored from the original to remove the exception throw/catch and
-        // instead proactively resizes the array instead of relying on excpetions + copy operations
-        public static char[] ToCharArray(int[] codePoints, int offset, int count)
+        // instead proactively resizes the array instead of relying on exceptions + copy operations
+        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int count)
         {
             if (count < 0)
             {
                 throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
             }
-            const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
+            const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
             // LUCENENET: as a first approximation, assume each codepoint
             // is 2 characters (since it cannot be longer than this)
             int arrayLength = count * 2;
-            // LUCENENET: if we go over the threashold, count the number of
+            // LUCENENET: if we go over the threshold, count the number of
             // chars we will need so we can allocate the precise amount of memory
-            if (count > countThreashold)
+            if (count > countThreshold)
             {
                 arrayLength = 0;
                 for (int r = offset, e = offset + count; r < e; ++r)
@@ -951,15 +950,18 @@ public static string ToHexString(string s)
         }
 
         /// <summary>
-        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+        /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
         /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
         /// <para/>
         /// NOTE: Full characters are read, even if this reads past the length passed (and
         /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
+        /// <remarks>
+        /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+        /// </remarks>
         // TODO: broken if chars.offset != 0
-        public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+        public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
         {
             int out_offset = chars.Offset = 0;
             char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
@@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
             chars.Length = out_offset - chars.Offset;
         }
 
+        #nullable enable
+        /// <summary>
+        /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
+        /// <para/>
+        /// NOTE: Explicit checks for valid UTF-8 are not performed.
+        /// </summary>
+        /// <remarks>
+        /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+        /// </remarks>
+        public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
+        {
+            CharsRef result = new CharsRef(length);
+            int out_offset = 0;
+            char[] @out = result.Chars;
+            int limit = offset + length;
+            while (offset < limit)
+            {
+                if (utf8.Length <= offset)
+                {
+                    chars = null;
+                    return false;
+                }
+
+                int b = utf8[offset++] & 0xff;
+                if (b < 0xc0)
+                {
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
+                    @out[out_offset++] = (char)b;
+                }
+                else if (b < 0xe0)
+                {
+                    if (utf8.Length <= offset)
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+                }
+                else if (b < 0xf0)
+                {
+                    if (utf8.Length <= offset + 1)
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
+                    offset += 2;
+                }
+                else
+                {
+                    if (utf8.Length <= offset + 2)
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
+                    int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
+                    offset += 3;
+                    if (ch < UNI_MAX_BMP)
+                    {
+                        @out[out_offset++] = (char)ch;
+                    }
+                    else
+                    {
+                        int chHalf = ch - 0x0010000;
+                        @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+                        @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+                    }
+                }
+            }
+            result.Length = out_offset;
+            chars = result;
+            return true;
+        }
+        #nullable restore
+
         /// <summary>
-        /// Utility method for <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/> </summary>
-        /// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>
+        /// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
+        /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
         {