Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduce some string allocation in Vocabulary #1355

Merged
merged 1 commit into from
Feb 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Humanizer/GlobalUsings.cs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
global using System.Globalization;
global using System.Text.RegularExpressions;
308 changes: 152 additions & 156 deletions src/Humanizer/Inflections/Vocabulary.cs
Original file line number Diff line number Diff line change
@@ -1,202 +1,198 @@
using System.Text.RegularExpressions;

namespace Humanizer
namespace Humanizer;

/// <summary>
/// A container for exceptions to simple pluralization/singularization rules.
/// Vocabularies.Default contains an extensive list of rules for US English.
/// At this time, multiple vocabularies and removing existing rules are not supported.
/// </summary>
public class Vocabulary
{
/// <summary>
/// Creates a vocabulary without running any setup logic.
/// The constructor is internal, so code outside this assembly cannot instantiate the type directly.
/// </summary>
internal Vocabulary() { }

readonly List<Rule> plurals = [];
readonly List<Rule> singulars = [];
readonly HashSet<string> uncountables = new(StringComparer.CurrentCultureIgnoreCase);
readonly Regex letterS = new("^([sS])[sS]*$");

/// <summary>
/// A container for exceptions to simple pluralization/singularization rules.
/// Vocabularies.Default contains an extensive list of rules for US English.
/// At this time, multiple vocabularies and removing existing rules are not supported.
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people".
/// </summary>
public class Vocabulary
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param>
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param>
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param>
public void AddIrregular(string singular, string plural, bool matchEnding = true)
{
internal Vocabulary()
if (matchEnding)
{
var singularSubstring = singular.Substring(1);
var pluralSubString = plural.Substring(1);
AddPlural($"({singular[0]}){singularSubstring}$", $"$1{pluralSubString}");
AddSingular($"({plural[0]}){pluralSubString}$", $"$1{singularSubstring}");
}

private readonly List<Rule> _plurals = new List<Rule>();
private readonly List<Rule> _singulars = new List<Rule>();
private readonly HashSet<string> _uncountables = new(StringComparer.CurrentCultureIgnoreCase);
private readonly Regex _letterS = new Regex("^([sS])[sS]*$");

/// <summary>
/// Adds a word to the vocabulary which cannot easily be pluralized/singularized by RegEx, e.g. "person" and "people".
/// </summary>
/// <param name="singular">The singular form of the irregular word, e.g. "person".</param>
/// <param name="plural">The plural form of the irregular word, e.g. "people".</param>
/// <param name="matchEnding">True to match these words on their own as well as at the end of longer words. False, otherwise.</param>
public void AddIrregular(string singular, string plural, bool matchEnding = true)
else
{
if (matchEnding)
{
AddPlural("(" + singular[0] + ")" + singular.Substring(1) + "$", "$1" + plural.Substring(1));
AddSingular("(" + plural[0] + ")" + plural.Substring(1) + "$", "$1" + singular.Substring(1));
}
else
{
AddPlural($"^{singular}$", plural);
AddSingular($"^{plural}$", singular);
}
AddPlural($"^{singular}$", plural);
AddSingular($"^{plural}$", singular);
}
}

/// <summary>
/// Registers an uncountable word such as "fish". Registered words are returned unchanged when plurality is changed.
/// </summary>
/// <param name="word">The uncountable word to register.</param>
public void AddUncountable(string word) => _uncountables.Add(word);
/// <summary>
/// Registers an uncountable word such as "fish". Registered words are returned unchanged when plurality is changed.
/// </summary>
/// <param name="word">The uncountable word to register.</param>
public void AddUncountable(string word)
{
    uncountables.Add(word);
}

/// <summary>
/// Registers a pluralization rule for words that do not follow the trivial rules, e.g. "bus" -> "buses".
/// Rules added later are tried before rules added earlier.
/// </summary>
/// <param name="rule">Regular expression to match, case insensitive, e.g. "(bus)$".</param>
/// <param name="replacement">Regular expression replacement, e.g. "$1es".</param>
public void AddPlural(string rule, string replacement) => _plurals.Add(new Rule(rule, replacement));
/// <summary>
/// Registers a pluralization rule for words that do not follow the trivial rules, e.g. "bus" -> "buses".
/// Rules added later are tried before rules added earlier.
/// </summary>
/// <param name="rule">Regular expression to match, case insensitive, e.g. "(bus)$".</param>
/// <param name="replacement">Regular expression replacement, e.g. "$1es".</param>
public void AddPlural(string rule, string replacement)
{
    var pluralRule = new Rule(rule, replacement);
    plurals.Add(pluralRule);
}

/// <summary>
/// Registers a singularization rule for words that do not follow the trivial rules, e.g. "vertices"/"indices" -> "vertex"/"index".
/// Rules added later are tried before rules added earlier.
/// </summary>
/// <param name="rule">Regular expression to match, case insensitive, e.g. "(vert|ind)ices$".</param>
/// <param name="replacement">Regular expression replacement, e.g. "$1ex".</param>
public void AddSingular(string rule, string replacement) => _singulars.Add(new Rule(rule, replacement));
/// <summary>
/// Registers a singularization rule for words that do not follow the trivial rules, e.g. "vertices"/"indices" -> "vertex"/"index".
/// Rules added later are tried before rules added earlier.
/// </summary>
/// <param name="rule">Regular expression to match, case insensitive, e.g. "(vert|ind)ices$".</param>
/// <param name="replacement">Regular expression replacement, e.g. "$1ex".</param>
public void AddSingular(string rule, string replacement)
{
    var singularRule = new Rule(rule, replacement);
    singulars.Add(singularRule);
}

/// <summary>
/// Pluralizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be pluralized</param>
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param>
public string Pluralize(string word, bool inputIsKnownToBeSingular = true)
/// <summary>
/// Pluralizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be pluralized</param>
/// <param name="inputIsKnownToBeSingular">Normally you call Pluralize on singular words; but if you're unsure call it with false</param>
public string Pluralize(string word, bool inputIsKnownToBeSingular = true)
{
var s = LetterS(word);
if (s != null)
{
var s = LetterS(word);
if (s != null)
{
return s + "s";
}
return s + "s";
}

var result = ApplyRules(_plurals, word, false);
var result = ApplyRules(plurals, word, false);

if (inputIsKnownToBeSingular)
{
return result ?? word;
}

var asSingular = ApplyRules(_singulars, word, false);
var asSingularAsPlural = ApplyRules(_plurals, asSingular, false);
if (asSingular != null && asSingular != word && asSingular + "s" != word && asSingularAsPlural == word && result != word)
{
return word;
}

return result;
if (inputIsKnownToBeSingular)
{
return result ?? word;
}

/// <summary>
/// Singularizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be singularized</param>
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param>
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param>
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false)
var asSingular = ApplyRules(singulars, word, false);
var asSingularAsPlural = ApplyRules(plurals, asSingular, false);
if (asSingular != null &&
asSingular != word &&
asSingular + "s" != word &&
asSingularAsPlural == word &&
result != word)
{
var s = LetterS(word);
if (s != null)
{
return s;
}
return word;
}

var result = ApplyRules(_singulars, word, skipSimpleWords);
return result;
}

if (inputIsKnownToBePlural)
{
return result ?? word;
}
/// <summary>
/// Singularizes the provided input considering irregular words
/// </summary>
/// <param name="word">Word to be singularized</param>
/// <param name="inputIsKnownToBePlural">Normally you call Singularize on plural words; but if you're unsure call it with false</param>
/// <param name="skipSimpleWords">Skip singularizing single words that have an 's' on the end</param>
public string Singularize(string word, bool inputIsKnownToBePlural = true, bool skipSimpleWords = false)
{
var s = LetterS(word);
if (s != null)
{
return s;
}

// the Plurality is unknown so we should check all possibilities
var asPlural = ApplyRules(_plurals, word, false);
var asPluralAsSingular = ApplyRules(_singulars, asPlural, false);
if (asPlural != word && word + "s" != asPlural && asPluralAsSingular == word && result != word)
{
return word;
}
var result = ApplyRules(singulars, word, skipSimpleWords);

if (inputIsKnownToBePlural)
{
return result ?? word;
}

private string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule)
// the Plurality is unknown so we should check all possibilities
var asPlural = ApplyRules(plurals, word, false);
var asPluralAsSingular = ApplyRules(singulars, asPlural, false);
if (asPlural == word ||
word + "s" == asPlural ||
asPluralAsSingular != word ||
result == word)
{
if (word == null)
{
return null;
}

if (word.Length < 1)
{
return word;
}
return result ?? word;
}

if (IsUncountable(word))
{
return word;
}
return word;
}

var result = word;
var end = skipFirstRule ? 1 : 0;
for (var i = rules.Count - 1; i >= end; i--)
{
if ((result = rules[i].Apply(word)) != null)
{
break;
}
}
return result != null ? MatchUpperCase(word, result) : result;
string ApplyRules(IList<Rule> rules, string word, bool skipFirstRule)
{
if (word == null)
{
return null;
}

private bool IsUncountable(string word)
if (word.Length < 1)
{
return _uncountables.Contains(word);
return word;
}

private string MatchUpperCase(string word, string replacement)
if (IsUncountable(word))
{
return char.IsUpper(word[0]) && char.IsLower(replacement[0]) ? char.ToUpper(replacement[0]) + replacement.Substring(1) : replacement;
return word;
}

/// <summary>
/// If the word is the letter s, singular or plural, return the letter s singular
/// </summary>
private string LetterS(string word)
var result = word;
var end = skipFirstRule ? 1 : 0;
for (var i = rules.Count - 1; i >= end; i--)
{
var s = _letterS.Match(word);
return s.Groups.Count > 1 ? s.Groups[1].Value : null;
if ((result = rules[i].Apply(word)) != null)
{
break;
}
}

private class Rule
if (result == null)
{
private readonly Regex _regex;
private readonly string _replacement;
return null;
}

public Rule(string pattern, string replacement)
{
_regex = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled);
_replacement = replacement;
}
return MatchUpperCase(word, result);
}

public string Apply(string word)
{
if (!_regex.IsMatch(word))
{
return null;
}
/// <summary>
/// Tells whether <paramref name="word"/> has been registered as uncountable.
/// The lookup uses the comparer the uncountables set was created with.
/// </summary>
bool IsUncountable(string word)
{
    return uncountables.Contains(word);
}

return _regex.Replace(word, _replacement);
/// <summary>
/// Carries the capitalization of the first character of <paramref name="word"/> over to <paramref name="replacement"/>.
/// Rules match case-insensitively but insert literal replacement text, so "Person" can come back as "people".
/// </summary>
/// <param name="word">The original input word; must contain at least one character.</param>
/// <param name="replacement">The inflected form produced by a rule; may be empty.</param>
/// <returns>
/// <paramref name="replacement"/> with its first character upper-cased when <paramref name="word"/> starts with an
/// upper-case character and <paramref name="replacement"/> starts with a lower-case one; otherwise <paramref name="replacement"/> unchanged.
/// </returns>
static string MatchUpperCase(string word, string replacement)
{
    // A rule can replace the whole word with an empty string; there is no first character to adjust,
    // and indexing replacement[0] would throw IndexOutOfRangeException.
    if (replacement.Length == 0)
    {
        return replacement;
    }

    if (char.IsUpper(word[0]) && char.IsLower(replacement[0]))
    {
        return char.ToUpper(replacement[0]) + replacement.Substring(1);
    }

    return replacement;
}

/// <summary>
/// Recognizes words made up solely of the letter s (e.g. "s", "S", "ss") and returns the singular letter.
/// </summary>
/// <param name="word">The word to inspect.</param>
/// <returns>The first character of <paramref name="word"/> in its original case, or null when the word is not just the letter s.</returns>
string LetterS(string word)
{
    var match = letterS.Match(word);
    if (!match.Success)
    {
        return null;
    }

    // Group 1 captures the leading s/S.
    return match.Groups[1].Value;
}

/// <summary>
/// A single inflection rule: a case-insensitive regular expression plus the replacement text applied when it matches.
/// </summary>
class Rule(string pattern, string replacement)
{
    readonly Regex regex = new(pattern, RegexOptions.IgnoreCase | RegexOptionsUtil.Compiled);

    /// <summary>
    /// Applies this rule to <paramref name="word"/>.
    /// </summary>
    /// <param name="word">The word to transform.</param>
    /// <returns>The word with the pattern's matches replaced, or null when the pattern does not match.</returns>
    public string Apply(string word) =>
        regex.IsMatch(word)
            ? regex.Replace(word, replacement)
            : null;
}
}
}