Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up KeyAnalyzer for substring based frozen collections #89689

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ The System.Collections.Immutable library is built-in as part of the shared frame

<ItemGroup>
<Compile Include="Properties\InternalsVisibleTo.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringEquality\SubstringEqualityComparers.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringEquality\SubstringEqualityComparerBase.cs" />

<Compile Include="System\Polyfills.cs" />
<Compile Include="System\Collections\ThrowHelper.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Buffers;
using System.Collections.Frozen.String.SubstringEquality;
using System.Collections.Generic;
using System.Diagnostics;
#if !NET8_0_OR_GREATER
using System.Runtime.CompilerServices;
#endif

namespace System.Collections.Frozen
{
Expand All @@ -33,84 +32,83 @@ public static AnalysisResults Analyze(
{
Debug.Assert(!uniqueStrings.IsEmpty);

// Try to pick a substring comparer. If we can't find a good substring comparer, fallback to a full string comparer.
AnalysisResults results;
if (minLength == 0 || !TryUseSubstring(uniqueStrings, ignoreCase, minLength, maxLength, out results))
if (minLength > 0)
{
results = CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0, static (s, _, _) => s.AsSpan());
}
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings
int uniqueStringsLength = uniqueStrings.Length;

return results;
}

/// <summary>Try to find the minimal unique substring index/length to use for comparisons.</summary>
private static bool TryUseSubstring(ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, out AnalysisResults results)
{
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit... t's not worth the increase in algorithmic complexity to analyze longer substrings
// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStringsLength / 20;

SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
comparer);
ISubstringEqualityComparer leftComparer = ignoreCase ? new LeftSubstringCaseInsensitiveComparer() : new LeftSubstringOrdinalComparer();
HashSet<string> leftSet = MakeHashSet(uniqueStringsLength, leftComparer);

// For each substring length...
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.IsLeft = true;
comparer.Count = count;
// we lazily spin up the right comparators when/if needed
ISubstringEqualityComparer? rightComparer = null;
HashSet<string>? rightSet = null;

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// For each substring length...preferring the shortest length that provides
// enough uniqueness
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.Index = index;
leftComparer.Count = count;

if (HasSufficientUniquenessFactor(set, uniqueStrings))
// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, index, count,
static (string s, int index, int count) => s.AsSpan(index, count));
return true;
}
}
leftComparer.Index = index;

// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left- vs
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// toggle the direction and re-use the comparer and hashset (HasSufficientUniquenessFactor clears it)
comparer.IsLeft = false;
if (HasSufficientUniquenessFactor(leftSet, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, leftComparer);
}
}

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
// There were no left-justified substrings of this length available.
// If all of the strings are of the same length, then just checking left-justification is sufficient.
// But if any strings are of different lengths, then we'll get different alignments for left- vs
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
// Get a uniqueness factor for the right-justified substrings.
// If it's above our threshold, we're done.
comparer.Index = -index - count;
if (HasSufficientUniquenessFactor(set, uniqueStrings))
rightComparer ??= ignoreCase ? new RightSubstringCaseInsensitiveComparer() : new RightSubstringOrdinalComparer();
rightSet ??= MakeHashSet(uniqueStringsLength, rightComparer);

// when Index is negative, we're offsetting from the right, ensure we're at least
// far enough from the right that we have count characters available
rightComparer!.Count = count;
rightComparer!.Index = -count;

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int offset = 0; offset <= minLength - count; offset++, rightComparer!.Index--)
{
results = CreateAnalysisResults(
uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count,
static (string s, int index, int count) => s.AsSpan(s.Length + index, count));
return true;
if (HasSufficientUniquenessFactor(rightSet!, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, rightComparer);
}
}
}
}
}

// Could not find a substring index/length that was good enough.
results = default;
return false;
// Could not find a substring index/length that was good enough, use the entire string.
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, s_FullComparer);
}

/// <summary>Creates an empty set of strings whose equality and hashing are defined by <paramref name="comparer"/>.</summary>
/// <param name="length">Initial capacity handed to the set; only consulted when NET6_0_OR_GREATER is defined.</param>
/// <param name="comparer">The comparer the set uses to decide whether two strings are the same entry.</param>
/// <returns>A new, empty <see cref="HashSet{T}"/> of strings.</returns>
private static HashSet<string> MakeHashSet(int length, IEqualityComparer<string> comparer)
{
#if NET6_0_OR_GREATER
    // Pre-size the set so it does not have to grow while strings are added.
    HashSet<string> set = new HashSet<string>(length, comparer);
#else
    // The capacity argument is not passed on this target.
    HashSet<string> set = new HashSet<string>(comparer);
#endif
    return set;
}

private static AnalysisResults CreateAnalysisResults(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count, GetSpan getSubstringSpan)
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, ISubstringEqualityComparer comparer)
{
// Start off by assuming all strings are ASCII
bool allAsciiIfIgnoreCase = true;
Expand All @@ -129,7 +127,7 @@ private static AnalysisResults CreateAnalysisResults(
foreach (string s in uniqueStrings)
{
// Get the span for the substring.
ReadOnlySpan<char> substring = getSubstringSpan(s, index, count);
ReadOnlySpan<char> substring = comparer.Slice(s);

// If the substring isn't ASCII, bail out to return the results.
if (!IsAllAscii(substring))
Expand All @@ -155,11 +153,9 @@ private static AnalysisResults CreateAnalysisResults(
}

// Return the analysis results.
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, comparer.Index, comparer.Count, minLength, maxLength);
}

private delegate ReadOnlySpan<char> GetSpan(string s, int index, int count);

internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
{
#if NET8_0_OR_GREATER
Expand Down Expand Up @@ -202,7 +198,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
#if NET8_0_OR_GREATER
private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
#endif
private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
internal static bool ContainsAnyLetters(ReadOnlySpan<char> s)
{
Debug.Assert(IsAllAscii(s));

Expand All @@ -221,18 +217,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
{
set.Clear();

// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;

foreach (string s in uniqueStrings)
{
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
{
set.Clear();
return false;
}
}
Expand Down Expand Up @@ -263,25 +254,6 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex
public bool RightJustifiedSubstring => HashIndex < 0;
}

private abstract class SubstringComparer : IEqualityComparer<string>
{
public int Index;
public int Count;
public bool IsLeft;
public abstract bool Equals(string? x, string? y);
public abstract int GetHashCode(string s);
}

private sealed class JustifiedSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).SequenceEqual(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count));
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}

private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
public override bool Equals(string? x, string? y) => x.AsSpan(IsLeft ? Index : (x!.Length + Index), Count).Equals(y.AsSpan(IsLeft ? Index : (y!.Length + Index), Count), StringComparison.OrdinalIgnoreCase);
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.AsSpan(IsLeft ? Index : (s.Length + Index), Count));
}
// Shared comparer passed to CreateAnalysisResults when no substring index/length is unique enough
// and the entire string is used instead. Declared readonly so the shared reference cannot be reassigned.
private static readonly FullStringEqualityComparer s_FullComparer = new FullStringEqualityComparer();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Runtime.CompilerServices;

namespace System.Collections.Frozen.String.SubstringEquality
{
/// <summary>
/// A string equality comparer that is configured with an <see cref="Index"/> and a <see cref="Count"/>
/// and can produce the corresponding slice of a string via <see cref="Slice"/>.
/// </summary>
internal interface ISubstringEqualityComparer : IEqualityComparer<string>
{
    /// <summary>Gets or sets the position at which the slice begins.</summary>
    /// <remarks>
    /// A zero or positive value is an offset from the left side of the string;
    /// a negative value is an offset from the right side.
    /// </remarks>
    int Index { get; set; }

    /// <summary>Gets or sets the desired length of the slice.</summary>
    int Count { get; set; }

    /// <summary>Creates a read-only span over the configured portion of the target string.</summary>
    /// <param name="s">The string to slice.</param>
    /// <returns>The span of <paramref name="s"/> selected by the comparer's index and count.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The configured <see cref="Index"/> or <see cref="Count"/> is not in range for <paramref name="s"/>.
    /// </exception>
    ReadOnlySpan<char> Slice(string s);
}

/// <summary>
/// Base class for substring comparers that forwards slicing, equality and hashing to a struct
/// wrapper supplied as the generic argument.
/// </summary>
/// <typeparam name="TThisWrapper">
/// The struct through which <see cref="Slice"/>, <see cref="Equals(string, string)"/> and
/// <see cref="GetHashCode(string)"/> are performed.
/// </typeparam>
internal abstract class SubstringEqualityComparerBase<TThisWrapper> : ISubstringEqualityComparer
    where TThisWrapper : struct, SubstringEqualityComparerBase<TThisWrapper>.IGenericSpecializedWrapper
{
    /// <summary>Struct wrapper around this instance; every forwarded call goes through it rather than through a virtual method.</summary>
    private readonly TThisWrapper _wrapper;

    /// <summary>Creates the wrapper and gives it a reference back to this comparer.</summary>
    protected SubstringEqualityComparerBase()
    {
        // Build the wrapper in a local, let it record this instance, then publish it to the field.
        TThisWrapper wrapper = default;
        wrapper.Store(this);
        _wrapper = wrapper;
    }

    /// <inheritdoc />
    public int Index { get; set; }

    /// <inheritdoc />
    public int Count { get; set; }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public ReadOnlySpan<char> Slice(string s)
    {
        return _wrapper.Slice(s);
    }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Equals(string? x, string? y)
    {
        return _wrapper.Equals(x, y);
    }

    /// <inheritdoc />
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int GetHashCode(string s)
    {
        return _wrapper.GetHashCode(s);
    }

    /// <summary>Contract for the struct wrapper that enables generic specialization with reference types.</summary>
    /// <remarks>
    /// Rather than each operation incurring virtual dispatch to the derived type, the derived type
    /// supplies a struct wrapper and this base class performs all calls through it, so the generic
    /// instantiation can be specialized and de-virtualized.
    /// </remarks>
    internal interface IGenericSpecializedWrapper
    {
        /// <summary>Hands the wrapper the comparer instance it wraps.</summary>
        void Store(ISubstringEqualityComparer @this);

        /// <summary>Produces the slice of <paramref name="s"/> for the wrapped comparer.</summary>
        ReadOnlySpan<char> Slice(string s);

        /// <summary>Compares <paramref name="x"/> and <paramref name="y"/> for the wrapped comparer.</summary>
        bool Equals(string? x, string? y);

        /// <summary>Computes the hash code of <paramref name="s"/> for the wrapped comparer.</summary>
        int GetHashCode(string s);
    }
}
}
Loading