diff --git a/config/vufind/sitemap.ini b/config/vufind/sitemap.ini
index 9b9782327c220079ae9ec111ac92b0cb8d77381c..27dc2e4f67a7337d71be4cfde65f65c425d50090 100644
--- a/config/vufind/sitemap.ini
+++ b/config/vufind/sitemap.ini
@@ -43,11 +43,11 @@ fileLocation = /tmp
 index[] = "Solr,/Record/"
 ;index[] = "SolrAuth,/Authority/Record?id="
 
-; This setting controls how IDs are retrieved from the index. It may be 'terms'
-; (the default, and the faster option) to use the terms component, or 'search'
-; to retrieve results using a normal search (which is useful if you need to
-; apply hidden filters to your results, or if you do not have terms enabled).
-retrievalMode = terms
+; This setting controls how IDs are retrieved from the index. It may be
+; 'search' (the default, most compatible but slower method), or 'terms' (the
+; faster option). Note that 'terms' method does not support hidden filters or
+; other limiting options and requires that the index has terms enabled.
+retrievalMode = search
 
 ; The SitemapIndex Section contains settings affecting the generation of
 ; a sitemap index file which groups multiple sitemap files. The sitemap
diff --git a/module/VuFind/src/VuFind/Sitemap/Generator.php b/module/VuFind/src/VuFind/Sitemap/Generator.php
index a84f14ae7ed8020a8341a6e5238f9c0a7e6f9fbb..a04068cdd9536140a5a2a80ff08f43d5e56b8d31 100644
--- a/module/VuFind/src/VuFind/Sitemap/Generator.php
+++ b/module/VuFind/src/VuFind/Sitemap/Generator.php
@@ -119,7 +119,7 @@ class Generator
      *
      * @var string
      */
-    protected $retrievalMode = 'terms';
+    protected $retrievalMode = 'search';
 
     /**
      * Constructor
@@ -242,36 +242,40 @@ class Generator
      */
     protected function generateForBackend(Backend $backend, $recordUrl, $currentPage)
     {
-        $lastTerm = '';
-        $count = 0;
+        // Starting offset varies depending on retrieval mode:
+        $currentOffset = ($this->retrievalMode === 'terms') ? '' : '*';
+        $recordCount = 0;
 
         while (true) {
             // Get IDs and break out of the loop if we've run out:
-            $ids = $this->getIdsFromBackend($backend, $lastTerm, $count);
-            if (empty($ids)) {
+            $result = $this->getIdsFromBackend($backend, $currentOffset);
+            if (empty($result['ids'])) {
                 break;
             }
+            $currentOffset = $result['nextOffset'];
 
             // Write the current entry:
             $smf = $this->getNewSitemap();
-            foreach ($ids as $item) {
+            foreach ($result['ids'] as $item) {
                 $loc = htmlspecialchars($recordUrl . urlencode($item));
                 if (strpos($loc, 'http') === false) {
                     $loc = 'http://' . $loc;
                 }
                 $smf->addUrl($loc);
-                $lastTerm = $item;
             }
             $filename = $this->getFilenameForPage($currentPage);
             if (false === $smf->write($filename)) {
                 throw new \Exception("Problem writing $filename.");
             }
 
-            // Update counters:
-            $count += $this->countPerPage;
+            // Update total record count:
+            $recordCount += count($result['ids']);
+
             if ($this->verbose) {
-                Console::writeLine("Page $currentPage processed");
+                Console::writeLine("Page $currentPage, $recordCount processed");
             }
+
+            // Update counter:
             $currentPage++;
         }
         return $currentPage;
@@ -280,18 +284,16 @@ class Generator
     /**
      * Retrieve a batch of IDs.
      *
-     * @param Backend $backend  Search backend
-     * @param string  $lastTerm Last term retrieved
-     * @param int     $offset   Number of terms previously retrieved
+     * @param Backend $backend       Search backend
+     * @param string  $currentOffset String representing progress through set
      *
      * @return array
      */
-    protected function getIdsFromBackend(Backend $backend, $lastTerm, $offset)
+    protected function getIdsFromBackend(Backend $backend, $currentOffset)
     {
-        if ($this->retrievalMode == 'terms') {
-            return $this->getIdsFromBackendUsingTerms($backend, $lastTerm);
-        }
-        return $this->getIdsFromBackendUsingSearch($backend, $offset);
+        $method = $this->retrievalMode == 'terms'
+            ? 'getIdsFromBackendUsingTerms' : 'getIdsFromBackendUsingCursorMark';
+        return $this->$method($backend, $currentOffset);
     }
 
     /**
@@ -307,19 +309,28 @@ class Generator
         $key = $backend->getConnector()->getUniqueKey();
         $info = $backend->terms($key, $lastTerm, $this->countPerPage)
             ->getFieldTerms($key);
-        return null === $info ? [] : array_keys($info->toArray());
+        $ids = null === $info ? [] : array_keys($info->toArray());
+        $nextOffset = empty($ids) ? null : $ids[count($ids) - 1];
+        return compact('ids', 'nextOffset');
     }
 
     /**
-     * Retrieve a batch of IDs using regular search.
+     * Retrieve a batch of IDs using a cursorMark.
      *
-     * @param Backend $backend Search backend
-     * @param int     $offset  Number of terms previously retrieved
+     * @param Backend $backend    Search backend
+     * @param string  $cursorMark cursorMark
      *
      * @return array
      */
-    protected function getIdsFromBackendUsingSearch(Backend $backend, $offset)
-    {
+    protected function getIdsFromBackendUsingCursorMark(Backend $backend, $cursorMark
+    ) {
+        // Finished once the cursor stops moving ('*' always starts a new set):
+        static $prevCursorMark = '';
+        if ($cursorMark !== '*' && $cursorMark === $prevCursorMark) {
+            return ['ids' => [], 'nextOffset' => $cursorMark];
+        }
+        $prevCursorMark = $cursorMark;
+
         $connector = $backend->getConnector();
         $key = $connector->getUniqueKey();
         $params = new ParamBag(
@@ -327,20 +338,23 @@ class Generator
                 'q' => '*:*',
                 'fl' => $key,
                 'rows' => $this->countPerPage,
-                'start' => $offset,
+                'start' => 0, // Always 0 when using a cursorMark
                 'wt' => 'json',
                 'sort' => $key . ' asc',
+                // Override any default timeAllowed since it cannot be used with
+                // cursorMark
+                'timeAllowed' => -1,
+                'cursorMark' => $cursorMark
             ]
         );
         $raw = $connector->search($params);
         $result = json_decode($raw);
         $ids = [];
-        if (isset($result->response->docs)) {
-            foreach ($result->response->docs as $doc) {
-                $ids[] = $doc->$key;
-            }
+        $nextOffset = $result->nextCursorMark ?? null;
+        foreach ($result->response->docs ?? [] as $doc) {
+            $ids[] = $doc->$key;
         }
-        return $ids;
+        return compact('ids', 'nextOffset');
     }
 
     /**
@@ -429,11 +443,7 @@ class Generator
     protected function getBaseSitemapIndexUrl()
     {
         // Pick the appropriate base URL based on the configuration files:
-        if (!isset($this->config->SitemapIndex->baseSitemapUrl)
-            || empty($this->config->SitemapIndex->baseSitemapUrl)
-        ) {
-            return $this->baseUrl;
-        }
-        return $this->config->SitemapIndex->baseSitemapUrl;
+        return empty($this->config->SitemapIndex->baseSitemapUrl)
+            ? $this->baseUrl : $this->config->SitemapIndex->baseSitemapUrl;
     }
 }