From 7892665e2bc79d58bb41924339f5e839b986b2d4 Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Wed, 19 Aug 2015 13:27:13 -0400
Subject: [PATCH] Web crawler improvements: added support for switches on
 webcrawl.php; don't waste time processing sitemaps with no URLs in them

---
 .../Controller/ImportController.php           | 75 ++++++++++++-------
 1 file changed, 49 insertions(+), 26 deletions(-)

diff --git a/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php b/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php
index c35b540e00f..1b6c89334d4 100644
--- a/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php
+++ b/module/VuFindConsole/src/VuFindConsole/Controller/ImportController.php
@@ -138,6 +138,16 @@ class ImportController extends AbstractBase
      */
     public function webcrawlAction()
     {
+        // Parse switches:
+        $this->consoleOpts->addRules(
+            ['test-only' => 'Use test mode', 'index-s' => 'Solr index to use']
+        );
+        $testMode = $this->consoleOpts->getOption('test-only') ? true : false;
+        $index = $this->consoleOpts->getOption('index');
+        if (empty($index)) {
+            $index = 'SolrWeb';
+        }
+
         $configLoader = $this->getServiceLocator()->get('VuFind\Config');
         $crawlConfig = $configLoader->get('webcrawl');
 
@@ -152,23 +162,26 @@ class ImportController extends AbstractBase
 
         // Loop through sitemap URLs in the config file.
         foreach ($crawlConfig->Sitemaps->url as $current) {
-            $this->harvestSitemap($current, $verbose);
+            $this->harvestSitemap($current, $verbose, $index, $testMode);
         }
 
-        // Perform the delete of outdated records:
-        $solr = $this->getServiceLocator()->get('VuFind\Solr\Writer');
-        if ($verbose) {
-            Console::writeLine("Deleting old records (prior to $startTime)...");
-        }
-        $solr->deleteByQuery('SolrWeb', 'last_indexed:[* TO ' . $startTime . ']');
-        if ($verbose) {
-            Console::writeLine('Committing...');
-        }
-        $solr->commit('SolrWeb');
-        if ($verbose) {
-            Console::writeLine('Optimizing...');
+        // Skip Solr operations if we're in test mode.
+        if (!$testMode) {
+            $solr = $this->getServiceLocator()->get('VuFind\Solr\Writer');
+            if ($verbose) {
+                Console::writeLine("Deleting old records (prior to $startTime)...");
+            }
+            // Perform the delete of outdated records:
+            $solr->deleteByQuery($index, 'last_indexed:[* TO ' . $startTime . ']');
+            if ($verbose) {
+                Console::writeLine('Committing...');
+            }
+            $solr->commit($index);
+            if ($verbose) {
+                Console::writeLine('Optimizing...');
+            }
+            $solr->optimize($index);
         }
-        $solr->optimize('SolrWeb');
     }
 
     /**
@@ -177,13 +190,16 @@ class ImportController extends AbstractBase
      * Process a sitemap URL, either harvesting its contents directly or recursively
      * reading in child sitemaps.
      *
-     * @param string $url     URL of sitemap to read.
-     * @param bool   $verbose Are we in verbose mode?
+     * @param string $url      URL of sitemap to read.
+     * @param bool   $verbose  Are we in verbose mode?
+     * @param string $index    Solr index to update
+     * @param bool   $testMode Are we in test mode?
      *
      * @return bool       True on success, false on error.
      */
-    protected function harvestSitemap($url, $verbose = false)
-    {
+    protected function harvestSitemap($url, $verbose = false, $index = 'SolrWeb',
+        $testMode = false
+    ) {
         if ($verbose) {
             Console::writeLine("Harvesting $url...");
         }
@@ -198,19 +214,26 @@ class ImportController extends AbstractBase
             $results = isset($xml->sitemap) ? $xml->sitemap : [];
             foreach ($results as $current) {
                 if (isset($current->loc)) {
-                    if (!$this->harvestSitemap((string)$current->loc, $verbose)) {
+                    $success = $this->harvestSitemap(
+                        (string)$current->loc, $verbose, $index, $testMode
+                    );
+                    if (!$success) {
                         $retVal = false;
                     }
                 }
             }
-
-            try {
-                $this->performImport($file, 'sitemap.properties', 'SolrWeb');
-            } catch (\Exception $e) {
-                if ($verbose) {
-                    Console::writeLine(get_class($e) . ': ' . $e->getMessage());
+            // Only import the current sitemap if it contains URLs!
+            if (isset($xml->url)) {
+                try {
+                    $this->performImport(
+                        $file, 'sitemap.properties', $index, $testMode
+                    );
+                } catch (\Exception $e) {
+                    if ($verbose) {
+                        Console::writeLine(get_class($e) . ': ' . $e->getMessage());
+                    }
+                    $retVal = false;
                 }
-                $retVal = false;
             }
         }
         unlink($file);
-- 
GitLab