From c86ed61a58f8ac2ebdabafb5212331e289584a99 Mon Sep 17 00:00:00 2001
From: Simon Cropp
Date: Tue, 20 Feb 2024 22:31:11 +1100
Subject: [PATCH] Add RegexCache for EnumerateMatches (#137)

---
 src/Directory.Build.props           |   2 +-
 src/Polyfill/Regex/RegexCache.cs    | 299 ++++++++++++++++++++++++++++
 src/Polyfill/Regex/RegexPolyfill.cs |   4 +-
 3 files changed, 302 insertions(+), 3 deletions(-)
 create mode 100644 src/Polyfill/Regex/RegexCache.cs

diff --git a/src/Directory.Build.props b/src/Directory.Build.props
index 36c11a1f..79148eef 100644
--- a/src/Directory.Build.props
+++ b/src/Directory.Build.props
@@ -1,7 +1,7 @@
- 2.6.0
+ 2.6.1
 1.0.0
 Polyfill
 true
diff --git a/src/Polyfill/Regex/RegexCache.cs b/src/Polyfill/Regex/RegexCache.cs
new file mode 100644
index 00000000..e7597238
--- /dev/null
+++ b/src/Polyfill/Regex/RegexCache.cs
@@ -0,0 +1,299 @@
+//
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+#pragma warning disable
+
+#if !NET7_0_OR_GREATER && FeatureMemory
+
+
+namespace System.Text.RegularExpressions;
+
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Diagnostics;
+using System.Globalization;
+using System.Threading;
+
+// https://github.com/dotnet/runtime/blob/main/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.Cache.cs
+/// <summary>Cache used to store Regex instances used by the static methods on Regex.</summary>
+internal sealed class RegexCache
+{
+    // The implementation is optimized to make cache hits fast and lock-free, only taking a global lock
+    // when adding a new Regex to the cache. Previous implementations of the cache took a global lock
+    // on all accesses, negatively impacting scalability, in order to minimize costs when the cache
+    // limit was hit and items needed to be dropped. In such situations, however, we're having to
+    // pay the relatively hefty cost of creating a new Regex, anyway, and if the consuming app cares
+    // about such costs, it should either increase Regex.CacheSize or do its own Regex instance caching.
+
+    /// <summary>The default maximum number of items to store in the cache.</summary>
+    private const int DefaultMaxCacheSize = 15;
+    /// <summary>The maximum number of cached items to examine when we need to replace an existing one in the cache with a new one.</summary>
+    /// <remarks>This is a somewhat arbitrary value, chosen to be small but at least as large as DefaultMaxCacheSize.</remarks>
+    private const int MaxExamineOnDrop = 30;
+
+    /// <summary>A read-through cache of one element, representing the most recently used regular expression.</summary>
+    private static volatile Node? s_lastAccessed;
+    /// <summary>The thread-safe dictionary storing all the items in the cache.</summary>
+    /// <remarks>
+    /// The concurrency level is initialized to 1 as we're using our own global lock for all mutations, so we don't need ConcurrentDictionary's
+    /// striped locking. Capacity is initialized to 31, which is the same as (the private) ConcurrentDictionary.DefaultCapacity.
+    /// </remarks>
+    private static readonly ConcurrentDictionary<Key, Node> s_cacheDictionary = new ConcurrentDictionary<Key, Node>(concurrencyLevel: 1, capacity: 31);
+    /// <summary>A list of all the items in the cache. Protected by <see cref="SyncObj"/>.</summary>
+    private static readonly List<Node> s_cacheList = new List<Node>(DefaultMaxCacheSize);
+    /// <summary>Random number generator used to examine a subset of items when we need to drop one from a large list. Protected by <see cref="SyncObj"/>.</summary>
+    private static readonly Random s_random = new Random();
+    /// <summary>The current maximum number of items allowed in the cache. This rarely changes. Mostly protected by <see cref="SyncObj"/>.</summary>
+    private static int s_maxCacheSize = DefaultMaxCacheSize;
+
+    /// <summary>Lock used to protect shared state on mutations.</summary>
+    private static object SyncObj => s_cacheDictionary;
+
+    /// <summary>Gets or sets the maximum size of the cache.</summary>
+    public static int MaxCacheSize
+    {
+        get
+        {
+            lock (SyncObj)
+            {
+                return s_maxCacheSize;
+            }
+        }
+        set
+        {
+            Debug.Assert(value >= 0);
+
+            lock (SyncObj)
+            {
+                // Store the new max cache size
+                s_maxCacheSize = value;
+
+                if (value == 0)
+                {
+                    // If the value is being changed to zero, just clear out the cache.
+                    s_cacheDictionary.Clear();
+                    s_cacheList.Clear();
+                    s_lastAccessed = null;
+                }
+                else if (value < s_cacheList.Count)
+                {
+                    // If the value is being changed to less than the number of items we're currently storing,
+                    // just trim off the excess. This is almost never done in practice (if Regex.CacheSize is set
+                    // at all, it's almost always done once towards the beginning of the process, and when it is done,
+                    // it's typically to either 0 or to a larger value than the current limit), so we're not concerned
+                    // with ensuring the actual oldest items are trimmed away.
+                    s_lastAccessed = s_cacheList[0];
+                    for (int i = value; i < s_cacheList.Count; i++)
+                    {
+                        s_cacheDictionary.TryRemove(s_cacheList[i].Key, out _);
+                    }
+                    s_cacheList.RemoveRange(value, s_cacheList.Count - value);
+
+                    Debug.Assert(s_cacheList.Count == value);
+                    Debug.Assert(s_cacheDictionary.Count == value);
+                }
+            }
+        }
+    }
+
+    /// <summary>Returns the cached Regex for <paramref name="pattern"/> (keyed with RegexOptions.None and Regex.InfiniteMatchTimeout), constructing and caching a new one on a miss.</summary>
+    public static Regex GetOrAdd(string pattern)
+    {
+        // Does not delegate to GetOrAdd(..., RegexOptions, ...) in order to avoid having
+        // a statically-reachable path to the 'new Regex(..., RegexOptions, ...)', which
+        // will force the Regex compiler to be reachable and thus rooted for trimming.
+
+        Key key = new Key(pattern, RegexOptions.None, Regex.InfiniteMatchTimeout);
+
+        Regex? regex = Get(key);
+        if (regex is null)
+        {
+            regex = new Regex(pattern);
+            Add(key, regex);
+        }
+
+        return regex;
+    }
+
+    /// <summary>Returns the cached Regex for the given pattern, options and match timeout, constructing and caching a new one on a miss.</summary>
+    public static Regex GetOrAdd(string pattern, RegexOptions options, TimeSpan matchTimeout)
+    {
+        Key key = new Key(pattern, options, matchTimeout);
+
+        Regex? regex = Get(key);
+        if (regex is null)
+        {
+            regex = new Regex(pattern, options, matchTimeout);
+            Add(key, regex);
+        }
+
+        return regex;
+    }
+
+    /// <summary>Looks up <paramref name="key"/> without taking the lock; returns null on a miss, and on a dictionary hit refreshes the node's access stamp and the most-recently-used slot.</summary>
+    private static Regex? Get(Key key)
+    {
+        long lastAccessedStamp = 0;
+
+        // We optimize for repeated usage of the same regular expression over and over,
+        // by having a fast-path that stores the most recently used instance. Check
+        // to see if that instance is the one we want; if it is, we're done.
+        if (s_lastAccessed is Node lastAccessed)
+        {
+            if (key.Equals(lastAccessed.Key))
+            {
+                return lastAccessed.Regex;
+            }
+
+            // We had a last accessed item, but it didn't match the one being requested.
+            // In case we need to replace the last accessed node, remember this one's stamp;
+            // we'll use it to compute the new access value for the new node replacing it.
+            lastAccessedStamp = Volatile.Read(ref lastAccessed.LastAccessStamp);
+        }
+
+        // Now consult the full cache.
+        if (s_maxCacheSize != 0 && // hot-read of s_maxCacheSize to try to avoid the cost of the dictionary lookup if the cache is disabled
+            s_cacheDictionary.TryGetValue(key, out Node? node))
+        {
+            // We found our item in the cache. Make this node's last access stamp one higher than
+            // the previous one. It's ok if multiple threads racing to update the last access cause
+            // multiple nodes to have the same value; it's an approximate value meant only to help
+            // remove the least valuable items when an item needs to be dropped from the cache. We
+            // do, however, need to read the old value and write the new value using Volatile.Read/Write,
+            // in order to prevent tearing of the 64-bit value on 32-bit platforms, and to help ensure
+            // that another thread subsequently sees this updated value.
+            Volatile.Write(ref node.LastAccessStamp, lastAccessedStamp + 1);
+
+            // Update our fast-path single-field cache.
+            s_lastAccessed = node;
+
+            // Return the cached regex.
+            return node.Regex;
+        }
+
+        // Not in the cache.
+        return null;
+    }
+
+    /// <summary>Adds <paramref name="regex"/> under <paramref name="key"/> while holding the lock, first evicting the examined entry with the smallest access stamp if the cache is full; does nothing if the cache is disabled or the key is already present.</summary>
+    private static void Add(Key key, Regex regex)
+    {
+        lock (SyncObj)
+        {
+            Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count);
+
+            // If the cache has been disabled, there's nothing to add. And if between just checking
+            // the cache in the caller and taking the lock, another thread could have added the regex.
+            // If that occurred, there's also nothing to add, and we don't bother to update any of the
+            // time stamp / fast-path field information, because hitting this race condition means it
+            // was just updated, and we gain little by updating it again.
+            if (s_maxCacheSize == 0 || s_cacheDictionary.TryGetValue(key, out _))
+            {
+                return;
+            }
+
+            // If the cache is full, remove an item to make room for the new one.
+            if (s_cacheList.Count == s_maxCacheSize)
+            {
+                int itemsToExamine;
+                bool useRandom;
+
+                if (s_maxCacheSize <= MaxExamineOnDrop)
+                {
+                    // Our maximum cache size is <= the number of items we're willing to examine (which is kept small simply
+                    // to avoid spending a lot of time). As such, we can just examine the whole list.
+                    itemsToExamine = s_cacheList.Count;
+                    useRandom = false;
+                }
+                else
+                {
+                    // Our maximum cache size is > the number of items we're willing to examine, so we'll instead
+                    // examine a random subset. This isn't perfect: if the size of the list is only a tiny bit
+                    // larger than the max we're willing to examine, there's a good chance we'll look at some of
+                    // the same items twice. That's fine; this doesn't need to be perfect. We do not need a perfect LRU
+                    // cache, just one that generally gets rid of older things when new things come in.
+                    itemsToExamine = MaxExamineOnDrop;
+                    useRandom = true;
+                }
+
+                // Pick the first item to use as the min.
+                int minListIndex = useRandom ? s_random.Next(s_cacheList.Count) : 0;
+                long min = Volatile.Read(ref s_cacheList[minListIndex].LastAccessStamp);
+
+                // Now examine the rest, keeping track of the smallest access stamp we find.
+                for (int i = 1; i < itemsToExamine; i++)
+                {
+                    int nextIndex = useRandom ? s_random.Next(s_cacheList.Count) : i;
+                    long next = Volatile.Read(ref s_cacheList[nextIndex].LastAccessStamp);
+                    if (next < min)
+                    {
+                        minListIndex = nextIndex;
+                        min = next;
+                    }
+                }
+
+                // Remove the key found to have the smallest access stamp.
+                s_cacheDictionary.TryRemove(s_cacheList[minListIndex].Key, out _);
+                s_cacheList.RemoveAt(minListIndex);
+            }
+
+            // Finally add the regex.
+            var node = new Node(key, regex);
+            s_lastAccessed = node;
+            s_cacheList.Add(node);
+            s_cacheDictionary.TryAdd(key, node);
+
+            Debug.Assert(s_cacheList.Count <= s_maxCacheSize);
+            Debug.Assert(s_cacheList.Count == s_cacheDictionary.Count);
+        }
+    }
+
+    /// <summary>Used as a key for <see cref="Node"/>.</summary>
+    internal readonly struct Key : IEquatable<Key>
+    {
+        private readonly string _pattern;
+        private readonly string _culture;
+        private readonly RegexOptions _options;
+        private readonly TimeSpan _matchTimeout;
+
+        public Key(string pattern, RegexOptions options, TimeSpan matchTimeout)
+        {
+            Debug.Assert(pattern != null, "Pattern must be provided");
+
+            _pattern = pattern;
+            // A Regex's case-insensitive behavior depends on the culture in effect when it is constructed, so the
+            // culture is part of the key (as in the upstream cache); otherwise an instance could be reused under another culture.
+            _culture = ((options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture).ToString();
+            _options = options;
+            _matchTimeout = matchTimeout;
+        }
+
+        public override bool Equals([NotNullWhen(true)] object? obj) =>
+            obj is Key other && Equals(other);
+
+        public bool Equals(Key other) =>
+            _pattern.Equals(other._pattern) &&
+            _culture.Equals(other._culture) &&
+            _options == other._options &&
+            _matchTimeout == other._matchTimeout;
+
+        public override int GetHashCode() =>
+            // Hash code only factors in pattern and options, as regex instances are unlikely to have
+            // the same pattern and options but different culture and timeout.
+            _pattern.GetHashCode() ^ (int)_options;
+    }
+
+    /// <summary>Node for a cached Regex instance.</summary>
+    private sealed class Node(Key key, Regex regex)
+    {
+        /// <summary>The key associated with this cached instance.</summary>
+        public readonly Key Key = key;
+        /// <summary>The cached Regex instance.</summary>
+        public readonly Regex Regex = regex;
+        /// <summary>A "time" stamp representing the approximate last access time for this Regex.</summary>
+        public long LastAccessStamp;
+    }
+}
+#endif
\ No newline at end of file
diff --git a/src/Polyfill/Regex/RegexPolyfill.cs b/src/Polyfill/Regex/RegexPolyfill.cs
index e2e92e7d..0c8d9285 100644
--- a/src/Polyfill/Regex/RegexPolyfill.cs
+++ b/src/Polyfill/Regex/RegexPolyfill.cs
@@ -73,7 +73,7 @@ public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, st
 #if NET7_0_OR_GREATER
         return Regex.EnumerateMatches(input, pattern);
 #else
-        return new Regex(pattern).EnumerateMatches(input);
+        return RegexCache.GetOrAdd(pattern).EnumerateMatches(input);
 #endif
     }
 
@@ -87,7 +87,7 @@ public static ValueMatchEnumerator EnumerateMatches(ReadOnlySpan<char> input, st
 #if NET7_0_OR_GREATER
         return Regex.EnumerateMatches(input, pattern, options, timeout);
 #else
-        return new Regex(pattern, options, timeout).EnumerateMatches(input);
+        return RegexCache.GetOrAdd(pattern, options, timeout).EnumerateMatches(input);
 #endif
     }
 