From cf1ce9b65367db7cc4f19f417fbe3a2544a436d4 Mon Sep 17 00:00:00 2001 From: Ere Maijala <ere.maijala@helsinki.fi> Date: Wed, 8 Jan 2014 11:39:00 -0500 Subject: [PATCH] Added Solr deduplication support. - See https://vufind.org/wiki/deduplication --- config/vufind/searches.ini | 8 + languages/en-gb.ini | 1 + languages/en.ini | 13 +- languages/fi.ini | 1 + languages/sv.ini | 1 + .../src/VuFind/RecordDriver/SolrDefault.php | 13 + .../Factory/AbstractSolrBackendFactory.php | 25 ++ .../Search/Solr/DeduplicationListener.php | 337 ++++++++++++++++++ .../SolrDefault/result-list.phtml | 27 ++ .../SolrDefault/result-list.phtml | 25 ++ .../SolrDefault/result-list.phtml | 25 ++ 11 files changed, 470 insertions(+), 6 deletions(-) create mode 100644 module/VuFind/src/VuFind/Search/Solr/DeduplicationListener.php diff --git a/config/vufind/searches.ini b/config/vufind/searches.ini index 3e8ab9cae84..828b736bb05 100644 --- a/config/vufind/searches.ini +++ b/config/vufind/searches.ini @@ -465,3 +465,11 @@ container_title = "Journal Title" [RawHiddenFilters] ;0 = "format:\"Book\" OR format:\"Journal\"" ;1 = "language:\"English\" OR language:\"French\"" + +; This section defines how records are handled when being fetched from Solr. +;[Records] +; Boolean value indicating if deduplication is enabled. Defaults to false. +;deduplication = true +; Priority order (descending) for record sources (record ID prefixes separated +; from the actual record by period, e.g. 
testsrc.12345) +;sources = alli,testsrc diff --git a/languages/en-gb.ini b/languages/en-gb.ini index 1f5f69ce075..6ece01780a7 100644 --- a/languages/en-gb.ini +++ b/languages/en-gb.ini @@ -548,6 +548,7 @@ original = "Original" Other Authors = "Other Authors" Other Editions = "Other Editions" Other Libraries = "Other Libraries" +Other Sources = "Other Sources" Password = Password Password Again = "Password Again" Password cannot be blank = "Password cannot be blank" diff --git a/languages/en.ini b/languages/en.ini index e73b9d0ab24..7088650c23e 100644 --- a/languages/en.ini +++ b/languages/en.ini @@ -202,9 +202,9 @@ date_year_placeholder = "Y" Days = Days Debug Information = "Debug Information" Delete = Delete +delete_all = "Delete All" delete_comment_failure = "Could not delete comment." delete_comment_success = "Comment deleted." -delete_all = "Delete All" delete_list = "Delete List" delete_page = "Delete Page" delete_selected = "Delete Selected" @@ -300,8 +300,8 @@ Find New Items = "Find New Items" Finding Aid = "Finding Aid" Fine = Fine Fines = Fines -First = First fine_limit_patron = "You have reached your fines limit and cannot renew items" +First = First First Name = "First Name" fix_metadata = "Yes, fix the metadata; I'll wait" for search = "for search" @@ -548,6 +548,7 @@ original = "Original" Other Authors = "Other Authors" Other Editions = "Other Editions" Other Libraries = "Other Libraries" +Other Sources = "Other Sources" Password = Password Password Again = "Password Again" Password cannot be blank = "Password cannot be blank" @@ -712,12 +713,12 @@ System Unavailable = "System Unavailable" Table of Contents = "Table of Contents" Table of Contents unavailable = "Table of Contents unavailable" Tag = Tag -tag_delete_warning = "Warning! 
You are about to delete %count% resource tag(s)" -tag_delete_filter = "You are using the following filter - Username: %username%, Tag: %tag%, Resource: %resource%" -tag_filter_empty = "No tags are available for this filter" Tag Management = "Tag Management" Tags = Tags tags_deleted = "%count% tag(s) deleted" +tag_delete_filter = "You are using the following filter - Username: %username%, Tag: %tag%, Resource: %resource%" +tag_delete_warning = "Warning! You are about to delete %count% resource tag(s)" +tag_filter_empty = "No tags are available for this filter" test_fail = "Failed" test_fix = "Fix" test_ok = "OK" @@ -752,9 +753,9 @@ total_tags = "Total Tags" total_users = "Total Users" tree_search_limit_reached_html = "Your search returned too many results to display in the tree. Showing only the first <b>%%limit%%</b> items. For a full search click <a id="fullSearchLink" href="%%url%%" target="_blank">here.</a>" unique_tags = "Unique Tags" +unique_tags = "Unique Tags" University Library = "University Library" Unknown = Unknown -unique_tags = "Unique Tags" Upgrade VuFind = "Upgrade VuFind" upgrade_description = "If you are upgrading a previous VuFind version, you can load your old settings with this tool." 
Use for = "Use for" diff --git a/languages/fi.ini b/languages/fi.ini index a90c8b5a468..2321cd6ecbb 100644 --- a/languages/fi.ini +++ b/languages/fi.ini @@ -551,6 +551,7 @@ original = "Alkuperäinen" Other Authors = "Muut tekijät" Other Editions = "Muut painokset" Other Libraries = "Muut kirjastot" +Other Sources = "Muut lähteet" Password = "Salasana / PIN *" Password Again = "Salasana uudelleen" Password cannot be blank = "Salasana ei voi olla tyhjä" diff --git a/languages/sv.ini b/languages/sv.ini index 6a45703970b..93ab60a55ae 100644 --- a/languages/sv.ini +++ b/languages/sv.ini @@ -551,6 +551,7 @@ original = "Original" Other Authors = "Andra upphovsmän" Other Editions = "Andra upplagor" Other Libraries = "Andra bibliotek" +Other Sources = "Andra källor" Password = "Lösenord / PIN *" Password Again = "Lösenord igen" Password cannot be blank = "Lösenordet kan inte lämnas tom" diff --git a/module/VuFind/src/VuFind/RecordDriver/SolrDefault.php b/module/VuFind/src/VuFind/RecordDriver/SolrDefault.php index f15120807f8..2a315b9da5b 100644 --- a/module/VuFind/src/VuFind/RecordDriver/SolrDefault.php +++ b/module/VuFind/src/VuFind/RecordDriver/SolrDefault.php @@ -1608,6 +1608,7 @@ class SolrDefault extends AbstractBase } return array_keys($types); } + /** * Get schema.org type mapping, expected to be a space-delimited string of * sub-types of http://schema.org/CreativeWork, defaulting to CreativeWork @@ -1619,4 +1620,16 @@ class SolrDefault extends AbstractBase { return implode(' ', $this->getSchemaOrgFormatsArray()); } + + /** + * Get information on records deduplicated with this one + * + * @return array Array keyed by source id containing record id + */ + public function getDedupData() + { + return isset($this->fields['dedup_data']) + ? 
$this->fields['dedup_data'] + : array(); + } } diff --git a/module/VuFind/src/VuFind/Search/Factory/AbstractSolrBackendFactory.php b/module/VuFind/src/VuFind/Search/Factory/AbstractSolrBackendFactory.php index 3146ee7a9b9..0c15d74d4e1 100644 --- a/module/VuFind/src/VuFind/Search/Factory/AbstractSolrBackendFactory.php +++ b/module/VuFind/src/VuFind/Search/Factory/AbstractSolrBackendFactory.php @@ -34,6 +34,7 @@ use VuFind\Search\Solr\InjectSpellingListener; use VuFind\Search\Solr\MultiIndexListener; use VuFind\Search\Solr\V3\ErrorListener as LegacyErrorListener; use VuFind\Search\Solr\V4\ErrorListener; +use VuFind\Search\Solr\DeduplicationListener; use VuFindSearch\Backend\BackendInterface; use VuFindSearch\Backend\Solr\LuceneSyntaxHelper; @@ -192,6 +193,14 @@ abstract class AbstractSolrBackendFactory implements FactoryInterface $mindexListener->attach($events); } + // Apply deduplication if applicable: + if (isset($search->Records->deduplication) + && $search->Records->deduplication + ) { + $deduplicationListener = $this->getDeduplicationListener($backend); + $deduplicationListener->attach($events); + } + // Attach error listeners for Solr 3.x and Solr 4.x (for backward // compatibility with VuFind 1.x instances). 
$legacyErrorListener = new LegacyErrorListener($backend); @@ -316,4 +325,20 @@ abstract class AbstractSolrBackendFactory implements FactoryInterface return $this->serviceLocator->get('VuFind\SearchSpecsReader') ->get($this->searchYaml); } + + /** + * Get a deduplication listener for the backend + * + * @param BackendInterface $backend Search backend + * + * @return \VuFind\Search\Solr\DeduplicationListener + */ + protected function getDeduplicationListener(BackendInterface $backend) + { + return new DeduplicationListener( + $backend, + $this->serviceLocator, + $this->searchConfig + ); + } } \ No newline at end of file diff --git a/module/VuFind/src/VuFind/Search/Solr/DeduplicationListener.php b/module/VuFind/src/VuFind/Search/Solr/DeduplicationListener.php new file mode 100644 index 00000000000..62a869c4284 --- /dev/null +++ b/module/VuFind/src/VuFind/Search/Solr/DeduplicationListener.php @@ -0,0 +1,337 @@ +<?php + +/** + * Solr deduplication (merged records) listener. + * + * PHP version 5 + * + * Copyright (C) Villanova University 2013. + * Copyright (C) The National Library of Finland 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Search + * @author David Maus <maus@hab.de> + * @author Ere Maijala <ere.maijala@helsinki.fi> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org Main Site + */ + +namespace VuFind\Search\Solr; + +use VuFindSearch\Backend\BackendInterface; + +use Zend\EventManager\SharedEventManagerInterface; +use Zend\EventManager\EventInterface; +use Zend\ServiceManager\ServiceLocatorInterface; + +/** + * Solr merged record handling listener. + * + * @category VuFind2 + * @package Search + * @author David Maus <maus@hab.de> + * @author Ere Maijala <ere.maijala@helsinki.fi> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org Main Site + */ +class DeduplicationListener +{ + /** + * Backend. + * + * @var BackendInterface + */ + protected $backend; + + /** + * Superior service manager. + * + * @var ServiceLocatorInterface + */ + protected $serviceLocator; + + /** + * Search configuration file identifier. + * + * @var string + */ + protected $searchConfig; + + /** + * Data source configuration file identifier. + * + * @var string + */ + protected $dataSourceConfig; + + /** + * Constructor. 
+ * + * @param BackendInterface $backend Search backend + * @param ServiceLocatorInterface $serviceLocator Service locator + * @param string $searchConfig Search config file id + * @param string $dataSourceConfig Data source file id + * + * @return void + */ + public function __construct( + BackendInterface $backend, + ServiceLocatorInterface $serviceLocator, + $searchConfig, $dataSourceConfig = 'datasources' + ) { + $this->backend = $backend; + $this->serviceLocator = $serviceLocator; + $this->searchConfig = $searchConfig; + $this->dataSourceConfig = $dataSourceConfig; + } + + /** + * Attach listener to shared event manager. + * + * @param SharedEventManagerInterface $manager Shared event manager + * + * @return void + */ + public function attach( + SharedEventManagerInterface $manager + ) { + $manager->attach('VuFind\Search', 'pre', array($this, 'onSearchPre')); + $manager->attach('VuFind\Search', 'post', array($this, 'onSearchPost')); + } + + /** + * Set up filter for excluding merge children. 
+     *
+     * @param EventInterface $event Event
+     *
+     * @return EventInterface
+     */
+    public function onSearchPre(EventInterface $event)
+    {
+        $backend = $event->getTarget();
+        if ($backend === $this->backend) {
+            $params = $event->getParam('params');
+            $context = $event->getParam('context');
+            if (($context == 'search' || $context == 'similar') && $params) {
+                $params->add('fq', '-merged_child_boolean:TRUE');
+            }
+        }
+        return $event;
+    }
+
+    /**
+     * Replace merged (dedup) records in search results with the best local one
+     *
+     * @param EventInterface $event Event
+     *
+     * @return EventInterface
+     */
+    public function onSearchPost(EventInterface $event)
+    {
+        // Only act on results coming from the backend this listener serves:
+        $backend = $event->getParam('backend');
+
+        if ($backend != $this->backend->getIdentifier()) {
+            return $event;
+        }
+        $context = $event->getParam('context');
+        if ($context == 'search') {
+            $this->fetchLocalRecords($event);
+        }
+        return $event;
+    }
+
+    /**
+     * Fetch local records for all the found dedup records
+     *
+     * @param EventInterface $event Event
+     *
+     * @return void
+     */
+    protected function fetchLocalRecords($event)
+    {
+        $config = $this->serviceLocator->get('VuFind\Config');
+        $searchConfig = $config->get($this->searchConfig);
+        $dataSourceConfig = $config->get($this->dataSourceConfig);
+        $recordSources = isset($searchConfig->Records->sources)
+            ? $searchConfig->Records->sources
+            : '';
+        $sourcePriority = $this->determineSourcePriority($recordSources);
+        $params = $event->getParam('params');
+        $buildingPriority = $this->determineBuildingPriority($params);
+
+        $idList = array();
+        // Find out the best records and list their IDs:
+        $result = $event->getTarget();
+        foreach ($result->getRecords() as $record) {
+            $fields = $record->getRawData();
+
+            if (!isset($fields['merged_boolean'])) {
+                continue;
+            }
+            $localIds = $fields['local_ids_str_mv'];
+            $dedupId = $localIds[0];
+            $priority = 99999;
+            $undefPriority = 99999;
+            // Lowest priority value wins; building filters rank before sources:
+            $dedupData = array();
+            foreach ($localIds as $localId) {
+                $localPriority = null;
+                list($source) = explode('.', $localId, 2);
+                if (!empty($buildingPriority)) {
+                    if (isset($buildingPriority[$source])) {
+                        $localPriority = -$buildingPriority[$source];
+                    } elseif (isset($dataSourceConfig[$source]['institution'])) {
+                        $institution = $dataSourceConfig[$source]['institution'];
+                        if (isset($buildingPriority[$institution])) {
+                            $localPriority = -$buildingPriority[$institution];
+                        }
+                    }
+                }
+                if (!isset($localPriority)) {
+                    if (isset($sourcePriority[$source])) {
+                        $localPriority = $sourcePriority[$source];
+                    } else {
+                        $localPriority = ++$undefPriority;
+                    }
+                }
+                if (isset($localPriority) && $localPriority < $priority) {
+                    $dedupId = $localId;
+                    $priority = $localPriority;
+                }
+                $dedupData[$source] = array(
+                    'id' => $localId,
+                    'priority' => isset($localPriority) ? $localPriority : 99999
+                );
+            }
+            $fields['dedup_id'] = $dedupId;
+            $idList[] = $dedupId;
+
+            // Sort dedupData by priority:
+            uasort(
+                $dedupData,
+                function($a, $b) {
+                    return $a['priority'] - $b['priority'];
+                }
+            );
+            $fields['dedup_data'] = $dedupData;
+            $record->setRawData($fields);
+        }
+        if (empty($idList)) {
+            return;
+        }
+
+        // Fetch records and assign them to the result:
+        $searchService = $this->serviceLocator->get('VuFind\Search');
+        $localRecords = $this->backend->retrieveBatch($idList)->getRecords();
+        foreach ($result->getRecords() as $record) {
+            $dedupRecordData = $record->getRawData();
+            if (!isset($dedupRecordData['dedup_id'])) {
+                continue;
+            }
+            // Find the corresponding local record in the results:
+            $foundLocalRecord = null;
+            foreach ($localRecords as $localRecord) {
+                $localFields = $localRecord->getRawData();
+                if ($localRecord->getUniqueID() == $dedupRecordData['dedup_id']) {
+                    $foundLocalRecord = $localRecord;
+                    break;
+                }
+            }
+            if (!$foundLocalRecord) {
+                continue;
+            }
+
+            $localRecordData = $foundLocalRecord->getRawData();
+
+            // Copy dedup_data for the active data sources:
+            foreach ($dedupRecordData['dedup_data'] as $dedupDataKey => $dedupData) {
+                if (!$recordSources || isset($sourcePriority[$dedupDataKey])) {
+                    $localRecordData['dedup_data'][$dedupDataKey] = $dedupData;
+                }
+            }
+
+            // Copy fields from dedup record to local record
+            $localRecordData = $this->appendDedupRecordFields(
+                $localRecordData,
+                $dedupRecordData,
+                $recordSources,
+                $sourcePriority
+            );
+            $record->setRawData($localRecordData);
+        }
+    }
+
+    /**
+     * Append fields from dedup record to the selected local record
+     *
+     * @param array  $localRecordData Local record data
+     * @param array  $dedupRecordData Dedup record data
+     * @param string $recordSources   List of active record sources, empty if all
+     * @param array  $sourcePriority  Array of source priorities keyed by source id
+     *
+     * @return array Local record data
+     */
+    protected function 
appendDedupRecordFields($localRecordData, $dedupRecordData,
+        $recordSources, $sourcePriority
+    ) {
+        $localRecordData['local_ids_str_mv'] = $dedupRecordData['local_ids_str_mv'];
+        return $localRecordData;
+    }
+
+    /**
+     * Function that determines the priority for sources
+     *
+     * @param string $recordSources Comma-separated source ids (searches.ini)
+     *
+     * @return array Array keyed by source with priority as the value
+     */
+    protected function determineSourcePriority($recordSources)
+    {
+        return array_flip(explode(',', $recordSources));
+    }
+
+    /**
+     * Function that determines the priority for buildings from 'fq' filters
+     *
+     * @param object $params Query parameters (must provide get('fq'))
+     *
+     * @return array Array keyed by building with priority as the value
+     */
+    protected function determineBuildingPriority($params)
+    {
+        $result = array();
+        foreach ((array)$params->get('fq') as $fq) {
+            if (strncmp($fq, 'building:', 9) == 0) {
+                if (preg_match(
+                    '/^building:"?\d+\/([^\/]+?)\//',
+                    $fq,
+                    $matches
+                )) {
+                    // Hierarchical facets; take only first level:
+                    $result[] = $matches[1];
+                } else {
+                    $result[] = trim(substr($fq, 9), '"'); // Flat facet value
+                }
+            }
+        }
+
+        array_unshift($result, '');
+        $result = array_flip($result);
+        return $result;
+    }
+
+}
diff --git a/themes/blueprint/templates/RecordDriver/SolrDefault/result-list.phtml b/themes/blueprint/templates/RecordDriver/SolrDefault/result-list.phtml index 8967ca6362d..b6d422505f2 100644 --- a/themes/blueprint/templates/RecordDriver/SolrDefault/result-list.phtml +++ b/themes/blueprint/templates/RecordDriver/SolrDefault/result-list.phtml @@ -66,6 +66,33 @@ } } ?> + + <? + /* Display information on duplicate records if available */ + $dedupData = $this->driver->getDedupData(); + if ($dedupData): ?> + <div class="dedupInformation"> + <? + $i = 0; + foreach ($dedupData as $source => $current) { + if (++$i == 1) { + ?><span class="currentSource"><a href="<?=$this->recordLink()->getUrl($this->driver)?>"><?=$this->transEsc("source_$source", array(), $source)?></a></span><? 
+ } else { + if ($i == 2) { + ?> <span class="otherSources">(<?=$this->transEsc('Other Sources')?>: <? + } else { + ?>, <? + } + ?><a href="<?=$this->recordLink()->getUrl($current['id'])?>"><?=$this->transEsc("source_$source", array(), $source)?></a><? + } + } + if ($i > 1) { + ?>)</span><? + }?> + </div> + <? endif; ?> + + <div class="callnumAndLocation"> <? if ($this->driver->supportsAjaxStatus()): ?> <strong class="hideIfDetailed"><?=$this->transEsc('Call Number')?>:</strong> diff --git a/themes/bootprint/templates/RecordDriver/SolrDefault/result-list.phtml b/themes/bootprint/templates/RecordDriver/SolrDefault/result-list.phtml index 523f94b8cb7..8a8c27329b6 100644 --- a/themes/bootprint/templates/RecordDriver/SolrDefault/result-list.phtml +++ b/themes/bootprint/templates/RecordDriver/SolrDefault/result-list.phtml @@ -76,6 +76,31 @@ <br/> <? endif; ?> + <? + /* Display information on duplicate records if available */ + $dedupData = $this->driver->getDedupData(); + if ($dedupData): ?> + <div class="dedupInformation"> + <? + $i = 0; + foreach ($dedupData as $source => $current) { + if (++$i == 1) { + ?><span class="currentSource"><a href="<?=$this->recordLink()->getUrl($this->driver)?>"><?=$this->transEsc("source_$source", array(), $source)?></a></span><? + } else { + if ($i == 2) { + ?> <span class="otherSources">(<?=$this->transEsc('Other Sources')?>: <? + } else { + ?>, <? + } + ?><a href="<?=$this->recordLink()->getUrl($current['id'])?>"><?=$this->transEsc("source_$source", array(), $source)?></a><? + } + } + if ($i > 1) { + ?>)</span><? + }?> + </div> + <? endif; ?> + <div class="callnumAndLocation ajax-availability hide"> <? 
if ($this->driver->supportsAjaxStatus()): ?> <strong class="hideIfDetailed"><?=$this->transEsc('Call Number')?>:</strong> diff --git a/themes/bootstrap/templates/RecordDriver/SolrDefault/result-list.phtml b/themes/bootstrap/templates/RecordDriver/SolrDefault/result-list.phtml index 3aa95c29786..d9c8ae71fd8 100644 --- a/themes/bootstrap/templates/RecordDriver/SolrDefault/result-list.phtml +++ b/themes/bootstrap/templates/RecordDriver/SolrDefault/result-list.phtml @@ -76,6 +76,31 @@ <br/> <? endif; ?> + <? + /* Display information on duplicate records if available */ + $dedupData = $this->driver->getDedupData(); + if ($dedupData): ?> + <div class="dedupInformation"> + <? + $i = 0; + foreach ($dedupData as $source => $current) { + if (++$i == 1) { + ?><span class="currentSource"><a href="<?=$this->recordLink()->getUrl($this->driver)?>"><?=$this->transEsc("source_$source", array(), $source)?></a></span><? + } else { + if ($i == 2) { + ?> <span class="otherSources">(<?=$this->transEsc('Other Sources')?>: <? + } else { + ?>, <? + } + ?><a href="<?=$this->recordLink()->getUrl($current['id'])?>"><?=$this->transEsc("source_$source", array(), $source)?></a><? + } + } + if ($i > 1) { + ?>)</span><? + }?> + </div> + <? endif; ?> + <div class="callnumAndLocation ajax-availability hide"> <? if ($this->driver->supportsAjaxStatus()): ?> <strong class="hideIfDetailed"><?=$this->transEsc('Call Number')?>:</strong> -- GitLab