Skip to content
Snippets Groups Projects
Commit 7892665e authored by Demian Katz
Browse files

Web crawler improvements.

- Added support for switches on webcrawl.php
- Don't waste time processing sitemaps with no URLs in them
parent 1249225b
No related merge requests found
...@@ -138,6 +138,16 @@ class ImportController extends AbstractBase ...@@ -138,6 +138,16 @@ class ImportController extends AbstractBase
*/ */
public function webcrawlAction() public function webcrawlAction()
{ {
// Parse switches:
$this->consoleOpts->addRules(
['test-only' => 'Use test mode', 'index-s' => 'Solr index to use']
);
$testMode = $this->consoleOpts->getOption('test-only') ? true : false;
$index = $this->consoleOpts->getOption('index');
if (empty($index)) {
$index = 'SolrWeb';
}
$configLoader = $this->getServiceLocator()->get('VuFind\Config'); $configLoader = $this->getServiceLocator()->get('VuFind\Config');
$crawlConfig = $configLoader->get('webcrawl'); $crawlConfig = $configLoader->get('webcrawl');
...@@ -152,23 +162,26 @@ class ImportController extends AbstractBase ...@@ -152,23 +162,26 @@ class ImportController extends AbstractBase
// Loop through sitemap URLs in the config file. // Loop through sitemap URLs in the config file.
foreach ($crawlConfig->Sitemaps->url as $current) { foreach ($crawlConfig->Sitemaps->url as $current) {
$this->harvestSitemap($current, $verbose); $this->harvestSitemap($current, $verbose, $index, $testMode);
} }
// Perform the delete of outdated records: // Skip Solr operations if we're in test mode.
$solr = $this->getServiceLocator()->get('VuFind\Solr\Writer'); if (!$testMode) {
if ($verbose) { $solr = $this->getServiceLocator()->get('VuFind\Solr\Writer');
Console::writeLine("Deleting old records (prior to $startTime)..."); if ($verbose) {
} Console::writeLine("Deleting old records (prior to $startTime)...");
$solr->deleteByQuery('SolrWeb', 'last_indexed:[* TO ' . $startTime . ']'); }
if ($verbose) { // Perform the delete of outdated records:
Console::writeLine('Committing...'); $solr->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']');
} if ($verbose) {
$solr->commit('SolrWeb'); Console::writeLine('Committing...');
if ($verbose) { }
Console::writeLine('Optimizing...'); $solr->commit($index);
if ($verbose) {
Console::writeLine('Optimizing...');
}
$solr->optimize($index);
} }
$solr->optimize('SolrWeb');
} }
/** /**
...@@ -177,13 +190,16 @@ class ImportController extends AbstractBase ...@@ -177,13 +190,16 @@ class ImportController extends AbstractBase
* Process a sitemap URL, either harvesting its contents directly or recursively * Process a sitemap URL, either harvesting its contents directly or recursively
* reading in child sitemaps. * reading in child sitemaps.
* *
* @param string $url URL of sitemap to read. * @param string $url URL of sitemap to read.
* @param bool $verbose Are we in verbose mode? * @param bool $verbose Are we in verbose mode?
* @param string $index Solr index to update
* @param bool $testMode Are we in test mode?
* *
* @return bool True on success, false on error. * @return bool True on success, false on error.
*/ */
protected function harvestSitemap($url, $verbose = false) protected function harvestSitemap($url, $verbose = false, $index = 'SolrWeb',
{ $testMode = false
) {
if ($verbose) { if ($verbose) {
Console::writeLine("Harvesting $url..."); Console::writeLine("Harvesting $url...");
} }
...@@ -198,19 +214,26 @@ class ImportController extends AbstractBase ...@@ -198,19 +214,26 @@ class ImportController extends AbstractBase
$results = isset($xml->sitemap) ? $xml->sitemap : []; $results = isset($xml->sitemap) ? $xml->sitemap : [];
foreach ($results as $current) { foreach ($results as $current) {
if (isset($current->loc)) { if (isset($current->loc)) {
if (!$this->harvestSitemap((string)$current->loc, $verbose)) { $success = $this->harvestSitemap(
(string)$current->loc, $verbose, $index, $testMode
);
if (!$success) {
$retVal = false; $retVal = false;
} }
} }
} }
// Only import the current sitemap if it contains URLs!
try { if (isset($xml->url)) {
$this->performImport($file, 'sitemap.properties', 'SolrWeb'); try {
} catch (\Exception $e) { $this->performImport(
if ($verbose) { $file, 'sitemap.properties', $index, $testMode
Console::writeLine(get_class($e) . ': ' . $e->getMessage()); );
} catch (\Exception $e) {
if ($verbose) {
Console::writeLine(get_class($e) . ': ' . $e->getMessage());
}
$retVal = false;
} }
$retVal = false;
} }
} }
unlink($file); unlink($file);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment