Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve ARP matching heuristic #2179

Merged
merged 4 commits into from
May 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/AppInstallerCLITests/ARPChanges.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <Workflows/WorkflowBase.h>
#include <Workflows/InstallFlow.h>
#include <winget/Manifest.h>
#include <winget/ARPCorrelationAlgorithms.h>
#include <Microsoft/PredefinedInstalledSourceFactory.h>

using namespace TestCommon;
Expand Down
33 changes: 21 additions & 12 deletions src/AppInstallerCLITests/Correlation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#include "TestSource.h"

#include <winget/ARPCorrelation.h>
#include <winget/ARPCorrelationAlgorithms.h>
#include <winget/Manifest.h>
#include <winget/RepositorySearch.h>

using namespace AppInstaller::Manifest;
using namespace AppInstaller::Repository;
using namespace AppInstaller::Repository::Correlation;
using namespace AppInstaller::Utility;

using namespace TestCommon;

Expand Down Expand Up @@ -76,13 +78,18 @@ Manifest GetManifestFromTestCase(const TestCase& testCase)
return manifest;
}

ARPEntry GetARPEntryFromTestCase(const TestCase& testCase)
ARPEntry GetARPEntryFromTestCase(const TestCase& testCase, bool isNew)
{
Manifest arpManifest;
arpManifest.DefaultLocalization.Add<Localization::PackageName>(testCase.ARPName);
arpManifest.DefaultLocalization.Add<Localization::Publisher>(testCase.ARPPublisher);
arpManifest.Localizations.push_back(arpManifest.DefaultLocalization);
return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), false };
return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), isNew };
}

ARPEntry GetExistingARPEntryFromTestCase(const TestCase& testCase)
{
return GetARPEntryFromTestCase(testCase, /* isNew */ false);
}

void ReportMatch(std::string_view label, std::string_view appName, std::string_view appPublisher, std::string_view arpName, std::string_view arpPublisher)
Expand All @@ -105,7 +112,7 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf

for (const auto& testCase : dataSet.TestCases)
{
arpEntries.push_back(GetARPEntryFromTestCase(testCase));
arpEntries.push_back(GetARPEntryFromTestCase(testCase, /* isNew */ true));
auto match = FindARPEntryForNewlyInstalledPackageWithHeuristics(GetManifestFromTestCase(testCase), arpEntries, correlationAlgorithm);
arpEntries.pop_back();

Expand All @@ -114,7 +121,9 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf
auto matchName = match->GetProperty(PackageVersionProperty::Name);
auto matchPublisher = match->GetProperty(PackageVersionProperty::Publisher);

if (matchName == testCase.ARPName && matchPublisher == testCase.ARPPublisher)
// The strings get normalized when added to the manifest, so we have
// to normalize for the comparison.
if (matchName == NormalizedString(testCase.ARPName) && matchPublisher == NormalizedString(testCase.ARPPublisher))
{
++result.TrueMatches;
}
Expand Down Expand Up @@ -225,10 +234,10 @@ DataSet GetDataSet_NoNoise()
dataSet.TestCases = LoadTestData();

// Arbitrary values. We should refine them as the algorithm gets better.
dataSet.RequiredTrueMatchRatio = 0.7;
dataSet.RequiredFalseMatchRatio = 0.05;
dataSet.RequiredTrueMatchRatio = 0.81;
dataSet.RequiredFalseMatchRatio = 0;
dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set
dataSet.RequiredFalseMismatchRatio = 0.3;
dataSet.RequiredFalseMismatchRatio = 0.25;

return dataSet;
}
Expand All @@ -238,14 +247,14 @@ DataSet GetDataSet_WithNoise()
DataSet dataSet;
auto baseTestCases = LoadTestData();

std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetARPEntryFromTestCase);
std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetExistingARPEntryFromTestCase);
dataSet.TestCases = std::move(baseTestCases);

// Arbitrary values. We should refine them as the algorithm gets better.
dataSet.RequiredTrueMatchRatio = 0.7;
dataSet.RequiredFalseMatchRatio = 0.05;
dataSet.RequiredTrueMatchRatio = 0.81;
dataSet.RequiredFalseMatchRatio = 0; // This should always stay at 0
dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set
dataSet.RequiredFalseMismatchRatio = 0.3;
dataSet.RequiredFalseMismatchRatio = 0.25;

return dataSet;
}
Expand All @@ -256,7 +265,7 @@ DataSet GetDataSet_WithNoise()
// performs well.
TEMPLATE_TEST_CASE("Correlation_MeasureAlgorithmPerformance", "[correlation][.]",
EmptyMatchConfidenceAlgorithm,
EditDistanceMatchConfidenceAlgorithm)
WordsEditDistanceMatchConfidenceAlgorithm)
{
// Each section loads a different data set,
// and then they are all handled the same
Expand Down
8 changes: 8 additions & 0 deletions src/AppInstallerCLITests/NameNormalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,11 @@ TEST_CASE("NameNorm_KBNumbers", "[name_norm]")

REQUIRE(normer.Normalize("Fix for (KB42)", {}).Name() == "FixforKB42");
}

TEST_CASE("NameNorm_Initial_PreserveWhitespace", "[name_norm]")
{
NameNormalizer normer(NormalizationVersion::InitialPreserveWhiteSpace);

REQUIRE(normer.NormalizeName("Some Name").Name() == "Some Name");
REQUIRE(normer.NormalizePublisher("Some Publisher Corp") == "Some Publisher");
}
10 changes: 10 additions & 0 deletions src/AppInstallerCLITests/Strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,14 @@ TEST_CASE("GetFileNameFromURI", "[strings]")
REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/pull/1722").u8string() == "1722");
REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/README.md").u8string() == "README.md");
REQUIRE(GetFileNameFromURI("https://microsoft.com/").u8string() == "");
}

TEST_CASE("SplitIntoWords", "[strings]")
{
REQUIRE(SplitIntoWords("A B") == std::vector<std::string>{ "A", "B" });
REQUIRE(SplitIntoWords("Some-Thing") == std::vector<std::string>{ "Some", "Thing" });

// 私のテスト = "My test" according to an online translator
// Split as "私" "の" "テスト"
REQUIRE(SplitIntoWords("\xe7\xa7\x81\xe3\x81\xae\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88") == std::vector<std::string>{ "\xe7\xa7\x81", "\xe3\x81\xae", "\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" });
}
3 changes: 2 additions & 1 deletion src/AppInstallerCLITests/TestData/InputARPData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,5 @@ XPFP42J061BPC1|Documenti Lavori Cantiere|Esposito Software di M. G. Caputo|Docum
XPFPFN4LT21PZJ|Studio Dentistico Pro|Esposito Software di M. G. Caputo|Studio Dentistico Pro Demo||Copyright Esposito Software|Studio Dentistico Pro Demo_is1
XPFPFWMVTR0WHP|Ashampoo UnInstaller 11|Ashampoo|Ashampoo UnInstaller 11|11.00.12|Ashampoo GmbH & Co. KG|{4209F371-B84B-F321-6BD3-1D91E2505732}_is1
XPFPFWV5JD80K2|BeeCut|网旭科技|BeeCut V1.7.7.22|1.7.7.22|Apowersoft LIMITED|{CA76BFA8-1862-49D7-B2C7-AE3D6CF40E53}_is1
XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349}
XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349}
XP8CDJNZKFM06W|Visual Studio Community 2019|Microsoft Corporation|Microsoft Visual Studio Installer|3.1.2196.8931|Microsoft Corporation|{6F320B93-EE3C-4826-85E0-ADF79F8D4C61}
29 changes: 29 additions & 0 deletions src/AppInstallerCommonCore/AppInstallerStrings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ namespace AppInstaller::Utility
return utext_char32At(m_text.get(), m_currentBrk);
}

// Returns the status from the break rule that determined the most recently break position.
int32_t CurrentRuleStatus()
{
return ubrk_getRuleStatus(m_brk.get());
}

private:
wil::unique_any<UText*, decltype(utext_close), &utext_close> m_text;
wil::unique_any<UBreakIterator*, decltype(ubrk_close), &ubrk_close> m_brk;
Expand Down Expand Up @@ -628,4 +634,27 @@ namespace AppInstaller::Utility

return path.filename();
}

std::vector<std::string> SplitIntoWords(std::string_view input)
JohnMcPMS marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a test with a few cases in it. Preferably also with some non-English strings, especially from a language without spaces.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a couple of test cases

{
ICUBreakIterator itr{ input, UBRK_WORD };
std::size_t currentOffset = 0;

std::vector<std::string> result;
while (itr.Next() != UBRK_DONE)
{
std::size_t nextOffset = itr.CurrentOffset();

// Ignore spaces and punctuation, accept words and numbers
if (itr.CurrentRuleStatus() != UBRK_WORD_NONE)
{
auto wordSize = nextOffset - currentOffset;
result.emplace_back(input, currentOffset, wordSize);
}

currentOffset = nextOffset;
}

return result;
}
}
50 changes: 41 additions & 9 deletions src/AppInstallerCommonCore/NameNormalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,22 @@ namespace AppInstaller::Utility
}

// Joins all of the given strings into a single value
static std::wstring Join(const std::vector<std::wstring>& values)
static std::wstring Join(const std::vector<std::wstring>& values, const std::wstring& separator = {})
{
std::wstring result;

bool isFirst = true;
for (const auto& v : values)
{
if (isFirst)
{
isFirst = false;
}
else
{
result += separator;
}

result += v;
}

Expand All @@ -246,6 +256,7 @@ namespace AppInstaller::Utility
Regex::Expression KBNumbers{ R"(\((KB\d+)\))", reOptions };

Regex::Expression NonLettersAndDigits{ R"([^\p{L}\p{Nd}])", reOptions };
Regex::Expression NonLetterDigitOrSpace{ R"([^\p{L}\p{Nd}\s])", reOptions };
Regex::Expression URIProtocol{ R"((?<!\p{L})(?:http[s]?|ftp):\/\/)", reOptions }; // remove protocol from URIs

Regex::Expression VersionDelimited{ R"(((?<!\p{L})(?:V|VER|VERSI(?:O|Ó)N|VERSÃO|VERSIE|WERSJA|BUILD|RELEASE|RC|SP)\P{L}?)?\p{Nd}+([\p{Po}\p{Pd}\p{Pc}]\p{Nd}?(RC|B|A|R|SP|K)?\p{Nd}+)+([\p{Po}\p{Pd}\p{Pc}]?[\p{L}\p{Nd}]+)*)", reOptions };
Expand Down Expand Up @@ -347,6 +358,8 @@ namespace AppInstaller::Utility
// The folded and sorted version of LocaleViews.
const std::vector<std::wstring> LegalEntitySuffixes;

const bool PreserveWhiteSpace;

static std::vector<std::wstring> FoldAndSort(const std::vector<std::wstring_view>& input)
{
std::vector<std::wstring> result;
Expand Down Expand Up @@ -377,10 +390,18 @@ namespace AppInstaller::Utility
while (RemoveAll(ProgramNameRegexes, result.Name));

auto tokens = Split(ProgramNameSplit, result.Name, LegalEntitySuffixes);
result.Name = Join(tokens);

// Drop all undesired characters
Remove(NonLettersAndDigits, result.Name);
// Re-join the tokens and drop all undesired characters
if (PreserveWhiteSpace)
{
result.Name = Join(tokens, L" ");
Remove(NonLetterDigitOrSpace, result.Name);
}
else
{
result.Name = Join(tokens);
Remove(NonLettersAndDigits, result.Name);
}

return result;
}
Expand All @@ -395,16 +416,24 @@ namespace AppInstaller::Utility
while (RemoveAll(PublisherNameRegexes, result.Publisher));

auto tokens = Split(PublisherNameSplit, result.Publisher, LegalEntitySuffixes, true);
result.Publisher = Join(tokens);

// Drop all undesired characters
Remove(NonLettersAndDigits, result.Publisher);
// Re-join the tokens and drop all undesired characters
if (PreserveWhiteSpace)
{
result.Publisher = Join(tokens, L" ");
Remove(NonLetterDigitOrSpace, result.Publisher);
}
else
{
result.Publisher = Join(tokens);
Remove(NonLettersAndDigits, result.Publisher);
}

return result;
}

public:
NormalizationInitial() : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews))
NormalizationInitial(bool preserveWhiteSpace) : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews)), PreserveWhiteSpace(preserveWhiteSpace)
{
}

Expand Down Expand Up @@ -448,7 +477,10 @@ namespace AppInstaller::Utility
switch (version)
{
case AppInstaller::Utility::NormalizationVersion::Initial:
m_normalizer = std::make_unique<NormalizationInitial>();
m_normalizer = std::make_unique<NormalizationInitial>(false);
break;
case AppInstaller::Utility::NormalizationVersion::InitialPreserveWhiteSpace:
m_normalizer = std::make_unique<NormalizationInitial>(true);
break;
default:
THROW_HR(E_INVALIDARG);
Expand Down
3 changes: 3 additions & 0 deletions src/AppInstallerCommonCore/Public/AppInstallerStrings.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ namespace AppInstaller::Utility
// Gets the file name part of the given URI.
std::filesystem::path GetFileNameFromURI(std::string_view uri);

// Splits the string into words.
std::vector<std::string> SplitIntoWords(std::string_view input);

// Converts a container to a string representation of it.
template <typename T, typename U>
std::string ConvertContainerToString(const T& container, U toString)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ namespace AppInstaller::Utility
enum class NormalizationVersion
{
Initial,
InitialPreserveWhiteSpace,
};

struct NameNormalizer;
Expand Down
Loading