diff --git a/conf/schema.xml b/conf/schema.xml index 43eb842..9b4c8a2 100644 --- a/conf/schema.xml +++ b/conf/schema.xml @@ -122,59 +122,11 @@ - - - - - - id - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/conf/solrconfig.xml b/conf/solrconfig.xml index 8a93f5b..4439ef7 100644 --- a/conf/solrconfig.xml +++ b/conf/solrconfig.xml @@ -61,7 +61,7 @@ explicit - text + title AND diff --git a/src/Query.php b/src/Query.php index d796122..623b2ca 100644 --- a/src/Query.php +++ b/src/Query.php @@ -33,6 +33,8 @@ namespace Opus\Search; use InvalidArgumentException; +use Opus\Common\Config; +use Opus\Search\Config as SearchConfig; use Opus\Search\Facet\Set; use Opus\Search\Filter\AbstractFilterBase; use RuntimeException; @@ -42,6 +44,7 @@ use function array_merge; use function array_shift; use function array_unique; +use function boolval; use function count; use function ctype_digit; use function intval; @@ -74,7 +77,7 @@ * @method int getRows( int $default = null ) * @method string[] getFields( array $default = null ) * @method array getSort( array $default = null ) - * @method bool getUnion( bool $default = null ) + * @method bool getUnion( bool $default = false ) * @method AbstractFilterBase getFilter(AbstractFilterBase $default = null ) retrieves condition to be met by resulting documents * @method Set getFacet( Set $default = null ) * @method $this setStart( int $offset ) @@ -86,6 +89,8 @@ * @method $this setFacet( Set $facet ) * @method $this addFields( string $fields ) * @method $this addSort( $sorting ) + * @method $this setWeightedFields( int[] $weightedFields ) assigns boost factors to fields (e.g. 
[ 'title' => 10, 'abstract' => 0.5 ]) + * @method $this setWeightMultiplier( int $multiplier ) multiplier to further increase boost factors when matching phrases */ class Query { @@ -95,14 +100,16 @@ class Query public function reset() { $this->data = [ - 'start' => null, - 'rows' => null, - 'fields' => null, - 'sort' => null, - 'union' => null, - 'filter' => null, - 'facet' => null, - 'subfilters' => null, + 'start' => null, + 'rows' => null, + 'fields' => null, + 'sort' => null, + 'union' => false, + 'filter' => null, + 'facet' => null, + 'subfilters' => null, + 'weightedfields' => null, + 'weightmultiplier' => null, ]; } @@ -184,6 +191,83 @@ protected function normalizeDirection($ascending) return $ascending; } + /** + * Returns true if a weighted search shall be used, otherwise returns false. + * + * @return bool + */ + public function getWeightedSearch() + { + if (! isset($this->data['weightedsearch'])) { + $config = Config::get(); + + if (isset($config->search->weightedSearch)) { + $this->data['weightedsearch'] = boolval($config->search->weightedSearch); + } else { + $this->data['weightedsearch'] = false; + } + } + + return $this->data['weightedsearch']; + } + + /** + * Set to true if a weighted search shall be used, otherwise set to false. + * + * @param bool $value + * @return $this fluent interface + */ + public function setWeightedSearch($value) + { + $this->data['weightedsearch'] = ! ! $value; + + return $this; + } + + /** + * Returns boost factors keyed by field (e.g. [ 'title' => 10, 'abstract' => 0.5 ]). 
+ * + * @return int[] + */ + public function getWeightedFields() + { + if ($this->data['weightedfields'] === null) { + $config = Config::get(); + + if (isset($config->search->simple)) { + $this->data['weightedfields'] = $config->search->simple->toArray(); + } else { + $this->data['weightedfields'] = []; + } + } + + return $this->data['weightedfields']; + } + + /** + * Returns a positive integer used as a multiplier to further increase field-specific boost factors when + * matching phrases (i.e., in cases where all query terms appear in close proximity). + * + * For example, with a weight multiplier of 5, the weightedfields array [ 'title' => 10, 'abstract' => 0.5 ] + * would be translated to [ 'title' => 50, 'abstract' => 2.5 ] when matching phrases. + * + * @return int + */ + public function getWeightMultiplier() + { + if ($this->data['weightmultiplier'] === null) { + $config = Config::get(); + + if (isset($config->search->weightMultiplier)) { + $this->data['weightmultiplier'] = $config->search->weightMultiplier; + } else { + $this->data['weightmultiplier'] = 1; + } + } + + return $this->data['weightmultiplier']; + } + /** * Retrieves value of selected query parameter. * @@ -214,6 +298,7 @@ public function set($name, $value, $adding = false) switch ($name) { case 'start': case 'rows': + case 'weightmultiplier': if ($adding) { throw new InvalidArgumentException('invalid parameter access on ' . $name); } @@ -300,6 +385,18 @@ public function set($name, $value, $adding = false) case 'subfilters': throw new RuntimeException('invalid access on sub filters'); + + case 'weightedfields': + if ($adding) { + throw new InvalidArgumentException('invalid parameter access on ' . $name); + } + + if (! 
is_array($value)) { + throw new InvalidArgumentException('invalid query fields option'); + } + + $this->data[$name] = $value; + break; } return $this; @@ -469,7 +566,7 @@ public function getSubFilters() */ public static function getParameterDefault($name, $fallbackIfMissing, $oldName = null) { - $config = Config::getDomainConfiguration(); + $config = SearchConfig::getDomainConfiguration(); $defaults = $config->parameterDefaults; if ($defaults instanceof Zend_Config) { diff --git a/src/Result/Base.php b/src/Result/Base.php index 2d9849f..a71232c 100644 --- a/src/Result/Base.php +++ b/src/Result/Base.php @@ -38,8 +38,10 @@ use Opus\Search\Log; use RuntimeException; +use function array_filter; use function array_key_exists; use function array_map; +use function array_values; use function count; use function ctype_digit; use function intval; @@ -192,9 +194,11 @@ public function getFacet($fieldName) * Retrieves set of matching and locally existing documents returned in * response to some search query. * + * @param bool $ignoreZeroScoreMatches ignore any matches with score 0.0 + * (true) or not (false); defaults to true * @return ResultMatch[] */ - public function getReturnedMatches() + public function getReturnedMatches($ignoreZeroScoreMatches = true) { if ($this->data['matches'] === null) { return []; @@ -208,7 +212,10 @@ public function getReturnedMatches() foreach ($this->data['matches'] as $match) { try { $match->getDocument(); - $matches[] = $match; + $ignoreMatch = $ignoreZeroScoreMatches === true && $match->getScore() === 0.0; + if ($ignoreMatch !== true) { + $matches[] = $match; + } } catch (DocumentException $e) { Log::get()->warn('skipping matching but locally missing document #' . $match->getId()); } @@ -223,18 +230,22 @@ public function getReturnedMatches() * * @note If query was requesting to retrieve non-qualified matches this set * might include IDs of documents that doesn't exist locally anymore. 
+ * @param bool $ignoreZeroScoreMatches ignore any matches with score 0.0 + * (true) or not (false); defaults to true * @return int[] */ - public function getReturnedMatchingIds() + public function getReturnedMatchingIds($ignoreZeroScoreMatches = true) { if ($this->data['matches'] === null) { return []; } - return array_map(function ($match) { - /** @var ResultMatch $match */ - return $match->getId(); + $matchingIds = array_map(function (ResultMatch $match) use ($ignoreZeroScoreMatches) { + $ignoreMatch = $ignoreZeroScoreMatches === true && $match->getScore() === 0.0; + return $ignoreMatch !== true ? $match->getId() : null; }, $this->data['matches']); + + return array_values(array_filter($matchingIds)); } /** @@ -247,7 +258,7 @@ public function getReturnedMatchingIds() * has changed in that it's returning set of Opus_Document instances * rather than set of Opus_Search_Util_Result instances. * @note The wording is less specific in that all information in response to - * search query may considered results of search. Thus this new API + * search query may be considered results of search. Thus this new API * prefers "matches" over "results". */ public function getResults() diff --git a/src/Solr/Solarium/Adapter.php b/src/Solr/Solarium/Adapter.php index bfc2096..96d5a73 100755 --- a/src/Solr/Solarium/Adapter.php +++ b/src/Solr/Solarium/Adapter.php @@ -77,6 +77,7 @@ use function file_exists; use function filesize; use function filter_var; +use function implode; use function in_array; use function intval; use function is_array; @@ -615,6 +616,25 @@ protected function applyParametersOnQuery( $query->setSorts($sortings); } + $isWeightedSearch = $parameters->getWeightedSearch(); + if ($isWeightedSearch === true) { + // get the edismax component + $edismax = $query->getEDisMax(); + + // NOTE: query is now an edismax query + $weightedFields = $parameters->getWeightedFields(); + if (! 
empty($weightedFields)) { + $queryFields = $this->getQueryFieldsString($weightedFields); + $edismax->setQueryFields($queryFields); + + $weightMultiplier = $parameters->getWeightMultiplier(); + if ($weightMultiplier !== null) { + $phraseFields = $this->getPhraseFieldsString($weightedFields, $weightMultiplier); + $edismax->setPhraseFields($phraseFields); + } + } + } + $facet = $parameters->getFacet(); if ($facet !== null) { $facetSet = $query->getFacetSet(); @@ -880,4 +900,38 @@ public function setTimeout($timeout) $this->client->setOptions($options, true); } } + + /** + * Converts an array containing boost factors keyed by field into a query fields string that can be used + * as input for the Solr `qf` request parameter. + * + * @param int[] $weightedFields assigns boost factors to fields, e.g.: [ 'title' => 10, 'abstract' => 0.5 ] + * @return string query fields string, e.g.: "title^10 abstract^0.5" + */ + protected function getQueryFieldsString($weightedFields) + { + $queryFields = []; + foreach ($weightedFields as $field => $boostFactor) { + $queryFields[] = "$field^$boostFactor"; + } + + return implode(' ', $queryFields); + } + + /** + * Generates a phrase fields string that can be used as input for the Solr `pf` request parameter. + * + * @param int[] $weightedFields assigns boost factors to fields, e.g.: [ 'title' => 10, 'abstract' => 0.5 ] + * @param int $weightMultiplier factor by which each boost factor will be multiplied when matching phrases, e.g.: 5 + * @return string phrase fields string, e.g.: "title^50 abstract^2.5" + */ + protected function getPhraseFieldsString($weightedFields, $weightMultiplier) + { + $phraseFields = []; + foreach ($weightedFields as $field => $boostFactor) { + $phraseFields[] = "$field^" . 
$boostFactor * $weightMultiplier; + } + + return implode(' ', $phraseFields); + } } diff --git a/test/QueryTest.php b/test/QueryTest.php index a1ec897..fc2fb7c 100644 --- a/test/QueryTest.php +++ b/test/QueryTest.php @@ -49,7 +49,7 @@ public function testInitiallyEmpty() $this->assertFalse(isset($query->rows)); $this->assertFalse(isset($query->fields)); $this->assertFalse(isset($query->sort)); - $this->assertFalse(isset($query->union)); + $this->assertFalse($query->union); } public function testSupportingExplicitGetter() @@ -60,7 +60,7 @@ public function testSupportingExplicitGetter() $this->assertNull($query->get('rows')); $this->assertNull($query->get('fields')); $this->assertNull($query->get('sort')); - $this->assertNull($query->get('union')); + $this->assertFalse($query->get('union')); } public function testSupportingImplicitGetter() @@ -71,7 +71,7 @@ public function testSupportingImplicitGetter() $this->assertNull($query->rows); $this->assertNull($query->fields); $this->assertNull($query->sort); - $this->assertNull($query->union); + $this->assertFalse($query->union); } public function testSupportingGetterMethods() @@ -82,7 +82,7 @@ public function testSupportingGetterMethods() $this->assertNull($query->getRows()); $this->assertNull($query->getFields()); $this->assertNull($query->getSort()); - $this->assertNull($query->getUnion()); + $this->assertFalse($query->getUnion()); } /** diff --git a/test/Solr/Solarium/AdapterSearchingTest.php b/test/Solr/Solarium/AdapterSearchingTest.php index f64d17c..e1c7d3c 100644 --- a/test/Solr/Solarium/AdapterSearchingTest.php +++ b/test/Solr/Solarium/AdapterSearchingTest.php @@ -33,19 +33,77 @@ namespace OpusTest\Search\Solr\Solarium; use Exception; +use Opus\Common\Document; use Opus\Common\Person; use Opus\Search\Query; use Opus\Search\QueryFactory; +use Opus\Search\SearchingInterface; use Opus\Search\Service; use Opus\Search\Solr\Solarium\Adapter; use Opus\Search\Util\Query as QueryUtil; use Opus\Search\Util\Searcher; use 
OpusTest\Search\TestAsset\DocumentBasedTestCase; +use function abs; use function count; +use function in_array; class AdapterSearchingTest extends DocumentBasedTestCase { + /** @var array[] */ + protected static $additionalDocumentPropertySets = [ + 'weightedTestDocA' => [ + 'TitleMain' => [ + 'Value' => 'Some Document', + 'Language' => 'eng', + ], + 'TitleAbstract' => [ + 'Value' => 'Abstract A, full query string (test document) only occurs in abstract.', + 'Language' => 'eng', + ], + ], + 'weightedTestDocB' => [ + 'TitleMain' => [ + 'Value' => 'Another Test Document', + 'Language' => 'eng', + ], + 'TitleAbstract' => [ + 'Value' => 'Abstract of document B, full query string only occurs in title.', + 'Language' => 'eng', + ], + ], + 'weightedTestDocC' => [ + 'TitleMain' => [ + 'Value' => 'Third One', + 'Language' => 'eng', + ], + 'TitleAbstract' => [ + 'Value' => 'Abstract C, first query term (test) only occurs in abstract.\nSome more text.', + 'Language' => 'eng', + ], + ], + 'weightedTestDocD' => [ + 'TitleMain' => [ + 'Value' => 'Fourth One', + 'Language' => 'eng', + ], + 'TitleAbstract' => [ + 'Value' => 'Abstract D, second query term (document) only occurs in abstract.\nEven more text.', + 'Language' => 'eng', + ], + ], + 'weightedTestDocE' => [ + 'TitleMain' => [ + 'Value' => 'Yet Another Test', + 'Language' => 'eng', + ], + 'TitleAbstract' => [ + 'Value' => 'Abstract of document E, title & abstract contain one query term each.', + 'Language' => 'eng', + ], + ], + ]; + public function testService() { $search = Service::selectSearchingService(null, 'solr'); @@ -210,22 +268,28 @@ public function testSearchWithDiacritics() $docB->addPersonAuthor($author); $docB->store(); - $index = Service::selectIndexingService(null, 'solr'); - $index->addDocumentsToIndex([$docA, $docB]); + $this->indexDocuments([$docA, $docB]); - $search = new Searcher(); + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'muller'); - 
$query = new QueryUtil(QueryUtil::SIMPLE); - $query->setCatchAll('muller'); - $result = $search->search($query); + $query->setWeightedSearch(true); + $query->setWeightedFields(['author' => 1.0]); - $this->assertEquals(2, $result->getAllMatchesCount()); + $result = $search->customSearch($query); + $matchingIds = $result->getReturnedMatchingIds(); - $query = new QueryUtil(QueryUtil::SIMPLE); - $query->setCatchAll('müller'); - $result = $search->search($query); + $this->assertEquals(2, count($matchingIds)); - $this->assertEquals(2, $result->getAllMatchesCount()); + $filter = $search->createFilter(); + $filter->createSimpleEqualityFilter('*')->addValue('müller'); + $query->setFilter($filter); + + $result = $search->customSearch($query); + $matchingIds = $result->getReturnedMatchingIds(); + + // when searching with diacritics, expect the same documents being found + $this->assertEquals(2, count($matchingIds)); } public function testMapYearFacetIndexFieldsToYearAsset() @@ -248,4 +312,377 @@ public function testMapYearFacetIndexFieldsToYearAsset() $this->assertEquals(1, $result->getAllMatchesCount()); } + + /** + * Test that a standard `AND` search (which uses Solr's standard query parser) + * finds all documents that contain both query terms in the default field ('title'). 
+ */ + public function testStandardAndSearch() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docC = $this->createDocument('weightedTestDocC'); // has only one query term (in abstract) + $docD = $this->createDocument('weightedTestDocD'); // has only one query term (in abstract) + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docC, $docD, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + // use Solr's standard query parser (which, as currently configured + // in solrconfig.xml, by default only searches the 'title' field) + $query->setWeightedSearch(false); + $query->setUnion(false); // use AND as default query operator + + $result = $search->customSearch($query); + $matchingIds = $result->getReturnedMatchingIds(); + + $this->assertEquals(1, count($matchingIds)); + + // expect only documents that contain both query terms in the default field ('title') + $this->assertTrue(in_array($docB->getId(), $matchingIds)); + } + + /** + * Test that a weighted `AND` search finds all documents that contain both + * query terms in the same field. 
+ */ + public function testWeightedAndSearchWithoutBoosts() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docC = $this->createDocument('weightedTestDocC'); // has only one query term (in abstract) + $docD = $this->createDocument('weightedTestDocD'); // has only one query term (in abstract) + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docC, $docD, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); // use Solr's eDisMax query parser + $query->setWeightedFields(['abstract' => 1.0, 'title' => 1.0]); // assigns boost factors to fields + $query->setUnion(false); // use AND as default query operator + + $result = $search->customSearch($query); + $matchingIds = $result->getReturnedMatchingIds(); + + $this->assertEquals(2, count($matchingIds)); + + // expect only documents that contain both query terms in the same field + $this->assertTrue(in_array($docA->getId(), $matchingIds)); + $this->assertTrue(in_array($docB->getId(), $matchingIds)); + } + + /** + * Test that a weighted `OR` search finds all documents that contain at least + * one query term in one of their fields. 
+ */ + public function testWeightedOrSearchWithoutBoosts() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docC = $this->createDocument('weightedTestDocC'); // has only one query term (in abstract) + $docD = $this->createDocument('weightedTestDocD'); // has only one query term (in abstract) + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docC, $docD, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 1.0, 'title' => 1.0]); + $query->setUnion(true); // use OR as default query operator + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + // expect all of the above documents to get found + $this->assertEquals(5, count($matches)); + } + + /** + * Test that a weighted `OR` search with boosted phrase matching results in increased + * importance given to search results containing an exact occurrence of the search string. 
+ */ + public function testWeightedOrSearchWithBoostedPhraseMatching() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docC = $this->createDocument('weightedTestDocC'); // has only one query term (in abstract) + $docD = $this->createDocument('weightedTestDocD'); // has only one query term (in abstract) + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docC, $docD, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 1.0, 'title' => 1.0]); + $query->setWeightMultiplier(5); // multiplier to further increase boost factors when matching phrases + $query->setUnion(true); // use OR as default query operator + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + $this->assertEquals(5, count($matches)); + + // expect the two documents matching the exact occurrence of the search string to sort first + $highestScoringIds = [$matches[0]->getDocument()->getId(), $matches[1]->getDocument()->getId()]; + $this->assertTrue(in_array($docA->getId(), $highestScoringIds)); + $this->assertTrue(in_array($docB->getId(), $highestScoringIds)); + + // expect much greater scores for the two documents matching the exact occurrence of the search string + $this->assertTrue($matches[0]->getScore() > 1.0); + $this->assertTrue($matches[1]->getScore() > 1.0); + + $this->assertTrue($matches[2]->getScore() < 1.0); + $this->assertTrue($matches[3]->getScore() < 1.0); + $this->assertTrue($matches[4]->getScore() < 1.0); + } + + /** + * Test that a weighted `AND` search with a field's boost factor set to 0 will + * cause a document with a match just in that field to 
get a score of 0. + */ + public function testWeightedAndSearchWithZeroedBoost() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 0, 'title' => 1.0]); + $query->setUnion(false); // use AND as default query operator + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(false); + + // expect only docA & docB to get found (which both contain the full query string in one of their fields) + $this->assertEquals(2, count($matches)); + + // expect docB (contains full query string in title) to sort first and with a score greater than 0 + $this->assertEquals($docB->getId(), $matches[0]->getDocument()->getId()); + $this->assertTrue($matches[0]->getScore() > 0.0); + + // expect docA (contains full query string in abstract) to sort last and with a score of 0 + $this->assertEquals($docA->getId(), $matches[1]->getDocument()->getId()); + $this->assertTrue($matches[1]->getScore() === 0.0); + } + + /** + * Test that a weighted `OR` search with a field's boost factor set to 0 will + * cause a document with a match just in that field to get a score of 0. 
+ */ + public function testWeightedOrSearchWithZeroedBoost() + { + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docD = $this->createDocument('weightedTestDocD'); // has only one query term (in abstract) + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docB, $docD, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 0, 'title' => 1.0]); + $query->setUnion(true); // use OR as default query operator + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(false); + + // expect all documents to get found since all of them contain at least one query term in one of their fields + $this->assertEquals(3, count($matches)); + + // expect docB (contains full query string in title) to sort first and with a score greater than 0 + $this->assertEquals($docB->getId(), $matches[0]->getDocument()->getId()); + $this->assertTrue($matches[0]->getScore() > 0.0); + + // expect docE (contains part of query string in title) to sort in the middle and with a score greater than 0 + $this->assertEquals($docE->getId(), $matches[1]->getDocument()->getId()); + $this->assertTrue($matches[1]->getScore() > 0.0); + + // expect docD (contains part of query string in abstract) to sort last and with a score of 0 + $this->assertEquals($docD->getId(), $matches[2]->getDocument()->getId()); + $this->assertTrue($matches[2]->getScore() === 0.0); + } + + /** + * Test that a weighted `AND` search with a field's boost factor set to 0 will + * by default ignore any matches with a score of 0. 
+ */ + public function testWeightedAndSearchLeavingOutZeroScoredMatches() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $docE = $this->createDocument('weightedTestDocE'); // title & abstract contain one query term each + $this->indexDocuments([$docA, $docB, $docE]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 0, 'title' => 1.0]); + $query->setUnion(false); // use AND as default query operator + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); // by default, ignores any matches with score 0.0 + + // expect only docB to get found, since docA only contains the full query string in the (ignored) abstract + $this->assertEquals(1, count($matches)); + + // expect just docB (contains full query string in title) with a score greater than 0 + $this->assertEquals($docB->getId(), $matches[0]->getDocument()->getId()); + $this->assertTrue($matches[0]->getScore() > 0.0); + } + + /** + * Test that a "weighted" search with undefined weights (i.e. no field-specific boost factors defined at all) + * will cause Solr to fall back to its standard query parser (which by default only searches the title field). 
+ */ + public function testWeightedSearchWithUndefinedWeights() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $this->indexDocuments([$docA, $docB]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + + // without any boost factors assigned to fields, expect only docB + // (which contains the query string in the title) to be found + $query->setWeightedFields([]); // defining no weights causes Solr to fall back to its standard query parser + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + $this->assertEquals(1, count($matches)); + + $this->assertEquals($docB->getId(), $matches[0]->getDocument()->getId()); + } + + /** + * Test that a weighted search with equal weights (i.e. no fields being boosted) will result in + * similar scores for two documents that both contain the full query string in one of their fields. 
+ */ + public function testWeightedSearchWithEqualWeights() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $this->indexDocuments([$docA, $docB]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + + // with equal boost factors, expect both documents being returned with roughly equal scores + $query->setWeightedFields(['abstract' => 1.0, 'title' => 1.0]); + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + $this->assertEquals(2, count($matches)); + + $this->assertTrue(abs($matches[0]->getScore() - $matches[1]->getScore()) < 1.0); + } + + /** + * Test that a weighted search with different boost factors assigned to fields will influence + * result scores accordingly & cause a document with a match in a boosted field to sort first. 
+ */ + public function testWeightedSearchWithBoostedFields() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $this->indexDocuments([$docA, $docB]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $this->adjustConfiguration([ + 'search' => [ + 'weightedSearch' => true, // use the Solr eDisMax query parser + 'simple' => [ + 'abstract' => 0.5, // decrease importance of abstract field + 'title' => 10, // increase importance of title field + ], + ], + ]); + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + $this->assertEquals(2, count($matches)); + + // expect clearly different scores between a document with a match in the boosted title field and one without + $this->assertTrue(abs($matches[0]->getScore() - $matches[1]->getScore()) > 1.0); + + // expect the document containing the query string in the boosted title field to sort first + $this->assertEquals($docB->getId(), $matches[0]->getDocument()->getId()); + } + + /** + * Test that a weighted search with (compared to the previous test) swapped boost factors + * will also cause the sort order of search results to get swapped. 
+ */ + public function testWeightedSearchWithBoostedFieldsSwapped() + { + $docA = $this->createDocument('weightedTestDocA'); // full query string only occurs in abstract + $docB = $this->createDocument('weightedTestDocB'); // full query string only occurs in title + $this->indexDocuments([$docA, $docB]); + + $search = Service::selectSearchingService(null, 'solr'); + $query = $this->queryWithSearchString($search, 'test document'); + + $query->setWeightedSearch(true); + $query->setWeightedFields(['abstract' => 10.0, 'title' => 0.5]); // increase importance of abstract field + + $result = $search->customSearch($query); + $matches = $result->getReturnedMatches(); + + $this->assertEquals(2, count($matches)); + + // expect clearly different scores between a document with a match in the boosted abstract field and one without + $this->assertTrue(abs($matches[0]->getScore() - $matches[1]->getScore()) > 1.0); + + // expect the document containing the query string in the boosted abstract field to sort first + $this->assertEquals($docA->getId(), $matches[0]->getDocument()->getId()); + } + + /** + * Adds the given documents to the Solr index. + * + * @param Document[] $documents documents to be indexed + */ + protected function indexDocuments($documents) + { + $index = Service::selectIndexingService(null, 'solr'); + $index->addDocumentsToIndex($documents); + } + + /** + * Returns a query object for the given search string sorting results by score in descending order. 
+ * + * @param SearchingInterface $search searching service to work with + * @param string $searchString query string to search for + * @return Query + */ + protected function queryWithSearchString($search, $searchString) + { + $query = new Query(); + $query->addSorting('score', false); + + // add query terms + $filter = $search->createFilter(); + $filter->createSimpleEqualityFilter('*')->addValue($searchString); + $query->setFilter($filter); + + return $query; + } } diff --git a/test/TestAsset/DocumentBasedTestCase.php b/test/TestAsset/DocumentBasedTestCase.php index 561a48e..54bfcaf 100644 --- a/test/TestAsset/DocumentBasedTestCase.php +++ b/test/TestAsset/DocumentBasedTestCase.php @@ -44,6 +44,7 @@ use ReflectionClass; use function array_key_exists; +use function array_merge; use function array_values; use function basename; use function file_get_contents; @@ -155,12 +156,15 @@ class DocumentBasedTestCase extends TestCase ], ]; + /** @var array[] */ + protected static $additionalDocumentPropertySets; + /** * @return array */ public static function documentPropertiesProvider() { - return self::$documentPropertySets; + return array_merge(static::$documentPropertySets, static::$additionalDocumentPropertySets ?? []); } /** @@ -169,11 +173,13 @@ public static function documentPropertiesProvider() */ public static function getDocumentDescriptionByName($name) { - if (! array_key_exists($name, self::$documentPropertySets)) { + $documentPropertySets = self::documentPropertiesProvider(); + + if (! 
array_key_exists($name, $documentPropertySets)) { throw new InvalidArgumentException("unknown document description"); } - return self::$documentPropertySets[$name]; + return $documentPropertySets[$name]; } /** @@ -187,9 +193,9 @@ public static function getDocumentDescriptionByName($name) protected function createDocument($documentProperties = null) { if ($documentProperties === null) { - $documentProperties = self::$documentPropertySets['article']; + $documentProperties = self::getDocumentDescriptionByName('article'); } if (is_string($documentProperties)) { - $documentProperties = self::$documentPropertySets[$documentProperties]; + $documentProperties = self::getDocumentDescriptionByName($documentProperties); } $document = Document::new(); diff --git a/test/Util/SearcherTest.php b/test/Util/SearcherTest.php index d2e4c1f..57189e5 100644 --- a/test/Util/SearcherTest.php +++ b/test/Util/SearcherTest.php @@ -37,6 +37,7 @@ use Opus\Common\Document; use Opus\Common\DocumentInterface; use Opus\Common\Model\ModelException; +use Opus\Common\Person; use Opus\Model\Xml; use Opus\Model\Xml\Cache; use Opus\Model\Xml\Version1; @@ -497,4 +498,47 @@ public function testFilterFacetQueriesByServerStatePublishedForUsers() { $this->markTestIncomplete('test not implemented yet - waiting for refactoring of isAdmin implementation'); } + + public function testAdvancedSearch() + { + $rows = 5; + $ids = []; + for ($i = 0; $i < $rows; $i++) { + $document = Document::new(); + $document->setServerState('published'); + $document->store(); + array_push($ids, $document->getId()); + } + + $doc = Document::get($ids[0]); + $author = Person::new(); + $author->setLastName('Doe'); + $author = $doc->addPersonAuthor($author); + $doc->store(); + + $doc = Document::get($ids[3]); + $author = Person::new(); + $author->setLastName('doe'); + $author = $doc->addPersonAuthor($author); + $doc->store(); + + $query = new Query(Query::ADVANCED); + $query->setStart(0); + $query->setRows(10); + 
$query->setSortField('score'); + $query->setSortOrder('desc'); + $query->setFilterQueries([]); + $query->setCatchAll(null); + $query->setFacetField(null); + $query->setReturnIdsOnly(false); + $query->setField('author', 'doe'); + $query->getQ(); + + $searcher = new Searcher(); + $results = $searcher->search($query); + + $this->assertEquals(2, $results->getAllMatchesCount()); + $this->assertContains($ids[0], $results->getReturnedMatchingIds()); + $this->assertContains($ids[3], $results->getReturnedMatchingIds()); + } }