Skip to content
Snippets Groups Projects
Commit f2de9167 authored by Ere Maijala, committed by Demian Katz
Browse files

Sitemap generation improvements (#1253)

- Use cursorMark for fetching IDs in sitemap generator search mode (faster for large indexes).
- Simplify code to better separate retrieval mode-specific logic.
- Change default retrieval mode to "search" since "terms" may not behave as expected in some situations (e.g. when hidden filters are applied).
parent 655b4270
No related merge requests found
......@@ -43,11 +43,11 @@ fileLocation = /tmp
index[] = "Solr,/Record/"
;index[] = "SolrAuth,/Authority/Record?id="
; This setting controls how IDs are retrieved from the index. It may be 'terms'
; (the default, and the faster option) to use the terms component, or 'search'
; to retrieve results using a normal search (which is useful if you need to
; apply hidden filters to your results, or if you do not have terms enabled).
retrievalMode = terms
; This setting controls how IDs are retrieved from the index. It may be
; 'search' (the default; the most compatible but slower method) or 'terms'
; (the faster option). Note that the 'terms' method does not support hidden
; filters or other limiting options, and it requires that the index has terms
; enabled.
retrievalMode = search
; The SitemapIndex Section contains settings affecting the generation of
; a sitemap index file which groups multiple sitemap files. The sitemap
......
......@@ -119,7 +119,7 @@ class Generator
*
* @var string
*/
protected $retrievalMode = 'terms';
protected $retrievalMode = 'search';
/**
* Constructor
......@@ -242,36 +242,40 @@ class Generator
*/
protected function generateForBackend(Backend $backend, $recordUrl, $currentPage)
{
$lastTerm = '';
$count = 0;
// Starting offset varies depending on retrieval mode:
$currentOffset = ($this->retrievalMode === 'terms') ? '' : '*';
$recordCount = 0;
while (true) {
// Get IDs and break out of the loop if we've run out:
$ids = $this->getIdsFromBackend($backend, $lastTerm, $count);
if (empty($ids)) {
$result = $this->getIdsFromBackend($backend, $currentOffset);
if (empty($result['ids'])) {
break;
}
$currentOffset = $result['nextOffset'];
// Write the current entry:
$smf = $this->getNewSitemap();
foreach ($ids as $item) {
foreach ($result['ids'] as $item) {
$loc = htmlspecialchars($recordUrl . urlencode($item));
if (strpos($loc, 'http') === false) {
$loc = 'http://' . $loc;
}
$smf->addUrl($loc);
$lastTerm = $item;
}
$filename = $this->getFilenameForPage($currentPage);
if (false === $smf->write($filename)) {
throw new \Exception("Problem writing $filename.");
}
// Update counters:
$count += $this->countPerPage;
// Update total record count:
$recordCount += count($result['ids']);
if ($this->verbose) {
Console::writeLine("Page $currentPage processed");
Console::writeLine("Page $currentPage, $recordCount processed");
}
// Update counter:
$currentPage++;
}
return $currentPage;
......@@ -280,18 +284,16 @@ class Generator
/**
* Retrieve a batch of IDs.
*
* @param Backend $backend Search backend
* @param string $lastTerm Last term retrieved
* @param int $offset Number of terms previously retrieved
* @param Backend $backend Search backend
* @param string $currentOffset String representing progress through set
*
* @return array
*/
protected function getIdsFromBackend(Backend $backend, $lastTerm, $offset)
protected function getIdsFromBackend(Backend $backend, $currentOffset)
{
if ($this->retrievalMode == 'terms') {
return $this->getIdsFromBackendUsingTerms($backend, $lastTerm);
}
return $this->getIdsFromBackendUsingSearch($backend, $offset);
$method = $this->retrievalMode == 'terms'
? 'getIdsFromBackendUsingTerms' : 'getIdsFromBackendUsingCursorMark';
return $this->$method($backend, $currentOffset);
}
/**
......@@ -307,19 +309,28 @@ class Generator
$key = $backend->getConnector()->getUniqueKey();
$info = $backend->terms($key, $lastTerm, $this->countPerPage)
->getFieldTerms($key);
return null === $info ? [] : array_keys($info->toArray());
$ids = null === $info ? [] : array_keys($info->toArray());
$nextOffset = empty($ids) ? null : $ids[count($ids) - 1];
return compact('ids', 'nextOffset');
}
/**
* Retrieve a batch of IDs using regular search.
* Retrieve a batch of IDs using a cursorMark.
*
* @param Backend $backend Search backend
* @param int $offset Number of terms previously retrieved
* @param Backend $backend Search backend
* @param string $cursorMark Cursor mark to resume from ('*' to start)
*
* @return array
*/
protected function getIdsFromBackendUsingSearch(Backend $backend, $offset)
{
protected function getIdsFromBackendUsingCursorMark(Backend $backend, $cursorMark
) {
// If the previous cursor mark matches the current one, we're finished!
static $prevCursorMark = '';
if ($cursorMark === $prevCursorMark) {
return ['ids' => [], 'cursorMark' => $cursorMark];
}
$prevCursorMark = $cursorMark;
$connector = $backend->getConnector();
$key = $connector->getUniqueKey();
$params = new ParamBag(
......@@ -327,20 +338,23 @@ class Generator
'q' => '*:*',
'fl' => $key,
'rows' => $this->countPerPage,
'start' => $offset,
'start' => 0, // Always 0 when using a cursorMark
'wt' => 'json',
'sort' => $key . ' asc',
// Override any default timeAllowed since it cannot be used with
// cursorMark
'timeAllowed' => -1,
'cursorMark' => $cursorMark
]
);
$raw = $connector->search($params);
$result = json_decode($raw);
$ids = [];
if (isset($result->response->docs)) {
foreach ($result->response->docs as $doc) {
$ids[] = $doc->$key;
}
$nextOffset = $result->nextCursorMark;
foreach ($result->response->docs ?? [] as $doc) {
$ids[] = $doc->$key;
}
return $ids;
return compact('ids', 'nextOffset');
}
/**
......@@ -429,11 +443,7 @@ class Generator
protected function getBaseSitemapIndexUrl()
{
// Pick the appropriate base URL based on the configuration files:
if (!isset($this->config->SitemapIndex->baseSitemapUrl)
|| empty($this->config->SitemapIndex->baseSitemapUrl)
) {
return $this->baseUrl;
}
return $this->config->SitemapIndex->baseSitemapUrl;
return empty($this->config->SitemapIndex->baseSitemapUrl)
? $this->baseUrl : $this->config->SitemapIndex->baseSitemapUrl;
}
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment