Add RegexCache for EnumerateMatches (#137)

SimonCropp authored Feb 20, 2024
1 parent 6676058 commit c86ed61
Showing 3 changed files with 293 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/Directory.Build.props
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<Project>
<PropertyGroup>
<Version>2.6.0</Version>
<Version>2.6.1</Version>
<AssemblyVersion>1.0.0</AssemblyVersion>
<PackageTags>Polyfill</PackageTags>
<DisableImplicitNamespaceImports>true</DisableImplicitNamespaceImports>
290 changes: 290 additions & 0 deletions src/Polyfill/Regex/RegexCache.cs
@@ -0,0 +1,290 @@
// <auto-generated />
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
#pragma warning disable

#if !NET7_0_OR_GREATER && FeatureMemory


namespace System.Text.RegularExpressions;

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Diagnostics;
using System.Globalization;
using System.Threading;

// https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs
/// <summary>Cache used to store Regex instances used by the static methods on Regex.</summary>
internal sealed class RegexCache
{
// The implementation is optimized to make cache hits fast and lock-free, only taking a global lock
// when adding a new Regex to the cache. Previous implementations of the cache took a global lock
// on all accesses, negatively impacting scalability, in order to minimize costs when the cache
// limit was hit and items needed to be dropped. In such situations, however, we're having to
// pay the relatively hefty cost of creating a new Regex, anyway, and if the consuming app cares
// about such costs, it should either increase Regex.CacheSize or do its own Regex instance caching.

/// <summary>The default maximum number of items to store in the cache.</summary>
private const int DefaultMaxCacheSize = 15;
/// <summary>The maximum number of cached items to examine when we need to replace an existing one in the cache with a new one.</summary>
/// <remarks>This is a somewhat arbitrary value, chosen to be small but at least as large as DefaultMaxCacheSize.</remarks>
private const int MaxExamineOnDrop = 30;

/// <summary>A read-through cache of one element, representing the most recently used regular expression.</summary>
private static volatile Node? s_lastAccessed;
/// <summary>The thread-safe dictionary storing all the items in the cache.</summary>
/// <remarks>
/// The concurrency level is initialized to 1 as we're using our own global lock for all mutations, so we don't need ConcurrentDictionary's
/// striped locking. Capacity is initialized to 31, which is the same as (the private) ConcurrentDictionary.DefaultCapacity.
/// </remarks>
private static readonly ConcurrentDictionary<Key, Node> s_cacheDictionary = new ConcurrentDictionary<Key, Node>(concurrencyLevel: 1, capacity: 31);
/// <summary>A list of all the items in the cache. Protected by <see cref="SyncObj"/>.</summary>
private static readonly List<Node> s_cacheList = new List<Node>(DefaultMaxCacheSize);
/// <summary>Random number generator used to examine a subset of items when we need to drop one from a large list. Protected by <see cref="SyncObj"/>.</summary>
private static readonly Random s_random = new Random();
/// <summary>The current maximum number of items allowed in the cache. This rarely changes. Mostly protected by <see cref="SyncObj"/>.</summary>
private static int s_maxCacheSize = DefaultMaxCacheSize;

/// <summary>Lock used to protect shared state on mutations.</summary>
private static object SyncObj => s_cacheDictionary;

/// <summary>Gets or sets the maximum size of the cache.</summary>
public static int MaxCacheSize
{
get
{
lock (SyncObj)
{
return s_maxCacheSize;
}
}
set
{
Debug.Assert(value >= 0);

lock (SyncObj)
{
// Store the new max cache size
s_maxCacheSize = value;

if (value == 0)
{
// If the value is being changed to zero, just clear out the cache.
s_cacheDictionary.Clear();
s_cacheList.Clear();
s_lastAccessed = null;
}
else if (value < s_cacheList.Count)
{
// If the value is being changed to less than the number of items we're currently storing,
// just trim off the excess. This is almost never done in practice (if Regex.CacheSize is set
// at all, it's almost always done once towards the beginning of the process, and when it is done,
// it's typically to either 0 or to a larger value than the current limit), so we're not concerned
// with ensuring the actual oldest items are trimmed away.
s_lastAccessed = s_cacheList[0];
for (int i = value; i < s_cacheList.Count; i++)
{
s_cacheDictionary.TryRemove(s_cacheList[i].Key, out _);
}
s_cacheList.RemoveRange(value, s_cacheList.Count - value);

Debug.Assert(s_cacheList.Count == value);
Debug.Assert(s_cacheDictionary.Count == value);
}
}
}
}

public static Regex GetOrAdd(string pattern)
{
// Does not delegate to GetOrAdd(..., RegexOptions, ...) in order to avoid having
// a statically-reachable path to the 'new Regex(..., RegexOptions, ...)', which
// will force the Regex compiler to be reachable and thus rooted for trimming.

Key key = new Key(pattern, RegexOptions.None, Regex.InfiniteMatchTimeout);

Regex? regex = Get(key);
if (regex is null)
{
regex = new Regex(pattern);
Add(key, regex);
}

return regex;
}

public static Regex GetOrAdd(string pattern, RegexOptions options, TimeSpan matchTimeout)
{
Key key = new Key(pattern, options, matchTimeout);

Regex? regex = Get(key);
if (regex is null)
{
regex = new Regex(pattern, options, matchTimeout);
Add(key, regex);
}

return regex;
}

private static Regex? Get(Key key)
{
long lastAccessedStamp = 0;

// We optimize for repeated usage of the same regular expression over and over,
// by having a fast-path that stores the most recently used instance. Check
// to see if that instance is the one we want; if it is, we're done.
if (s_lastAccessed is Node lastAccessed)
{
if (key.Equals(lastAccessed.Key))
{
return lastAccessed.Regex;
}

// We had a last accessed item, but it didn't match the one being requested.
// In case we need to replace the last accessed node, remember this one's stamp;
// we'll use it to compute the new access value for the new node replacing it.
lastAccessedStamp = Volatile.Read(ref lastAccessed.LastAccessStamp);
}

// Now consult the full cache.
if (s_maxCacheSize != 0 && // hot-read of s_maxCacheSize to try to avoid the cost of the dictionary lookup if the cache is disabled
s_cacheDictionary.TryGetValue(key, out Node? node))
{
// We found our item in the cache. Make this node's last access stamp one higher than
// the previous one. It's ok if multiple threads racing to update the last access cause
// multiple nodes to have the same value; it's an approximate value meant only to help
// remove the least valuable items when an item needs to be dropped from the cache. We
// do, however, need to read the old value and write the new value using Volatile.Read/Write,
// in order to prevent tearing of the 64-bit value on 32-bit platforms, and to help ensure
// that another thread subsequently sees this updated value.
Volatile.Write(ref node.LastAccessStamp, lastAccessedStamp + 1);

// Update our fast-path single-field cache.
s_lastAccessed = node;

// Return the cached regex.
return node.Regex;
}

// Not in the cache.
return null;
}

private static void Add(Key key, Regex regex)
{
lock (SyncObj)
{
Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count);

// If the cache has been disabled, there's nothing to add. And if between just checking
// the cache in the caller and taking the lock, another thread could have added the regex.
// If that occurred, there's also nothing to add, and we don't bother to update any of the
// time stamp / fast-path field information, because hitting this race condition means it
// was just updated, and we gain little by updating it again.
if (s_maxCacheSize == 0 || s_cacheDictionary.TryGetValue(key, out _))
{
return;
}

// If the cache is full, remove an item to make room for the new one.
if (s_cacheList.Count == s_maxCacheSize)
{
int itemsToExamine;
bool useRandom;

if (s_maxCacheSize <= MaxExamineOnDrop)
{
// Our maximum cache size is <= the number of items we're willing to examine (which is kept small simply
// to avoid spending a lot of time). As such, we can just examine the whole list.
itemsToExamine = s_cacheList.Count;
useRandom = false;
}
else
{
// Our maximum cache size is > the number of items we're willing to examine, so we'll instead
// examine a random subset. This isn't perfect: if the size of the list is only a tiny bit
// larger than the max we're willing to examine, there's a good chance we'll look at some of
// the same items twice. That's fine; this doesn't need to be perfect. We do not need a perfect LRU
// cache, just one that generally gets rid of older things when new things come in.
itemsToExamine = MaxExamineOnDrop;
useRandom = true;
}

// Pick the first item to use as the min.
int minListIndex = useRandom ? s_random.Next(s_cacheList.Count) : 0;
long min = Volatile.Read(ref s_cacheList[minListIndex].LastAccessStamp);

// Now examine the rest, keeping track of the smallest access stamp we find.
for (int i = 1; i < itemsToExamine; i++)
{
int nextIndex = useRandom ? s_random.Next(s_cacheList.Count) : i;
long next = Volatile.Read(ref s_cacheList[nextIndex].LastAccessStamp);
if (next < min)
{
minListIndex = nextIndex;
min = next;
}
}

// Remove the key found to have the smallest access stamp.
s_cacheDictionary.TryRemove(s_cacheList[minListIndex].Key, out _);
s_cacheList.RemoveAt(minListIndex);
}

// Finally add the regex.
var node = new Node(key, regex);
s_lastAccessed = node;
s_cacheList.Add(node);
s_cacheDictionary.TryAdd(key, node);

Debug.Assert(s_cacheList.Count <= s_maxCacheSize);
Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count);
}
}

/// <summary>Used as a key for <see cref="Node"/>.</summary>
internal readonly struct Key : IEquatable<Key>
{
private readonly string _pattern;
private readonly RegexOptions _options;
private readonly TimeSpan _matchTimeout;

public Key(string pattern, RegexOptions options, TimeSpan matchTimeout)
{
Debug.Assert(pattern != null, "Pattern must be provided");

_pattern = pattern;
_options = options;
_matchTimeout = matchTimeout;
}

public override bool Equals([NotNullWhen(true)] object? obj) =>
obj is Key other && Equals(other);

public bool Equals(Key other) =>
_pattern.Equals(other._pattern) &&
_options == other._options &&
_matchTimeout == other._matchTimeout;

public override int GetHashCode() =>
// Hash code only factors in pattern and options, as regex instances are unlikely to have
// the same pattern and options but different culture and timeout.
_pattern.GetHashCode() ^ (int)_options;
}

/// <summary>Node for a cached Regex instance.</summary>
private sealed class Node(Key key, Regex regex)
{
/// <summary>The key associated with this cached instance.</summary>
public readonly Key Key = key;
/// <summary>The cached Regex instance.</summary>
public readonly Regex Regex = regex;
/// <summary>A "time" stamp representing the approximate last access time for this Regex.</summary>
public long LastAccessStamp;
}
}
#endif
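
To illustrate the intended behaviour of the cache above, here is a minimal sketch (not part of the commit). It assumes it runs inside the Polyfill assembly, since RegexCache is internal, and the pattern, options, and timeout values are illustrative only.

// Sketch only: RegexCache is internal, so this must live in the same assembly.
using System;
using System.Text.RegularExpressions;

static class RegexCacheSketch
{
    static void Demo()
    {
        var first = RegexCache.GetOrAdd(@"\d+");
        var second = RegexCache.GetOrAdd(@"\d+");

        // Same key (pattern, RegexOptions.None, infinite timeout): the cached instance is
        // returned, so no second Regex is constructed.
        Console.WriteLine(ReferenceEquals(first, second)); // True while the entry stays cached

        // A different key (different options/timeout here) creates and caches a separate instance.
        var cased = RegexCache.GetOrAdd(@"\d+", RegexOptions.IgnoreCase, TimeSpan.FromSeconds(1));
        Console.WriteLine(ReferenceEquals(first, cased)); // False
    }
}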
4 changes: 2 additions & 2 deletions src/Polyfill/Regex/RegexPolyfill.cs
@@ -73,7 +73,7 @@ public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, st
#if NET7_0_OR_GREATER
return Regex.EnumerateMatches(input, pattern);
#else
return new Regex(pattern).EnumerateMatches(input);
return RegexCache.GetOrAdd(pattern).EnumerateMatches(input);
#endif
}

@@ -87,7 +87,7 @@ public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, st
#if NET7_0_OR_GREATER
return Regex.EnumerateMatches(input, pattern, options, timeout);
#else
return new Regex(pattern, options, timeout).EnumerateMatches(input);
return RegexCache.GetOrAdd(pattern, options, timeout).EnumerateMatches(input);
#endif
}

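For context, a usage sketch of the polyfilled API that this change affects (not part of the commit). It assumes a project referencing the Polyfill package on a target framework below .NET 7 with FeatureMemory, where RegexPolyfill.EnumerateMatches and the ValueMatch types are available; namespace/import details are assumed.

// Sketch only: repeated calls with the same pattern now reuse the Regex cached by
// RegexCache.GetOrAdd instead of constructing a new Regex on every call.
using System;

static class EnumerateMatchesSketch
{
    static void Run()
    {
        ReadOnlySpan<char> input = "ids: 12, 345, 6789".AsSpan();

        for (var i = 0; i < 3; i++)
        {
            foreach (var match in RegexPolyfill.EnumerateMatches(input, @"\d+"))
            {
                // ValueMatch exposes Index and Length; slice the input to get the matched text.
                Console.WriteLine(input.Slice(match.Index, match.Length).ToString());
            }
        }
    }
}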