From 7892665e2bc79d58bb41924339f5e839b986b2d4 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Wed, 19 Aug 2015 13:27:13 -0400 Subject: [PATCH] Web crawler improvements. - Added support for switches on webcrawl.php - Don't waste time processing sitemaps with no URLs in them --- .../Controller/ImportController.php | 75 ++++++++++++------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php b/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php index c35b540e00f..1b6c89334d4 100644 --- a/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php +++ b/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php @@ -138,6 +138,16 @@ class ImportController extends AbstractBase */ public function webcrawlAction() { + // Parse switches: + $this->consoleOpts->addRules( + ['test-only' => 'Use test mode', 'index-s' => 'Solr index to use'] + ); + $testMode = $this->consoleOpts->getOption('test-only') ? true : false; + $index = $this->consoleOpts->getOption('index'); + if (empty($index)) { + $index = 'SolrWeb'; + } + $configLoader = $this->getServiceLocator()->get('VuFind\Config'); $crawlConfig = $configLoader->get('webcrawl'); @@ -152,23 +162,26 @@ class ImportController extends AbstractBase // Loop through sitemap URLs in the config file. foreach ($crawlConfig->Sitemaps->url as $current) { - $this->harvestSitemap($current, $verbose); + $this->harvestSitemap($current, $verbose, $index, $testMode); } - // Perform the delete of outdated records: - $solr = $this->getServiceLocator()->get('VuFind\Solr\Writer'); - if ($verbose) { - Console::writeLine("Deleting old records (prior to $startTime)..."); - } - $solr->deleteByQuery('SolrWeb', 'last_indexed:[* TO ' . $startTime . ']'); - if ($verbose) { - Console::writeLine('Committing...'); - } - $solr->commit('SolrWeb'); - if ($verbose) { - Console::writeLine('Optimizing...'); + // Skip Solr operations if we're in test mode. + if (!$testMode) { + $solr = $this->getServiceLocator()->get('VuFind\Solr\Writer'); + if ($verbose) { + Console::writeLine("Deleting old records (prior to $startTime)..."); + } + // Perform the delete of outdated records: + $solr->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']'); + if ($verbose) { + Console::writeLine('Committing...'); + } + $solr->commit($index); + if ($verbose) { + Console::writeLine('Optimizing...'); + } + $solr->optimize($index); } - $solr->optimize('SolrWeb'); } /** @@ -177,13 +190,16 @@ class ImportController extends AbstractBase * Process a sitemap URL, either harvesting its contents directly or recursively * reading in child sitemaps. * - * @param string $url URL of sitemap to read. - * @param bool $verbose Are we in verbose mode? + * @param string $url URL of sitemap to read. + * @param bool $verbose Are we in verbose mode? + * @param string $index Solr index to update + * @param bool $testMode Are we in test mode? * * @return bool True on success, false on error. */ - protected function harvestSitemap($url, $verbose = false) - { + protected function harvestSitemap($url, $verbose = false, $index = 'SolrWeb', + $testMode = false + ) { if ($verbose) { Console::writeLine("Harvesting $url..."); } @@ -198,19 +214,26 @@ class ImportController extends AbstractBase $results = isset($xml->sitemap) ? $xml->sitemap : []; foreach ($results as $current) { if (isset($current->loc)) { - if (!$this->harvestSitemap((string)$current->loc, $verbose)) { + $success = $this->harvestSitemap( + (string)$current->loc, $verbose, $index, $testMode + ); + if (!$success) { $retVal = false; } } } - - try { - $this->performImport($file, 'sitemap.properties', 'SolrWeb'); - } catch (\Exception $e) { - if ($verbose) { - Console::writeLine(get_class($e) . ': ' . $e->getMessage()); + // Only import the current sitemap if it contains URLs! + if (isset($xml->url)) { + try { + $this->performImport( + $file, 'sitemap.properties', $index, $testMode + ); + } catch (\Exception $e) { + if ($verbose) { + Console::writeLine(get_class($e) . ': ' . $e->getMessage()); + } + $retVal = false; } - $retVal = false; } } unlink($file); -- GitLab