From 436987065b381df514d9f4f5381c059115fa7866 Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Thu, 10 Sep 2020 12:40:47 -0400
Subject: [PATCH] Fix replaceTerm() word boundary / normalization bugs. (#1712)

- Resolves VUFIND-1423.
- Includes some additional test cases.
---
 .../src/VuFindSearch/Query/Query.php          | 28 ++++++++-------
 .../src/VuFindTest/Query/QueryTest.php        | 35 +++++++++++++++++++
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/module/VuFindSearch/src/VuFindSearch/Query/Query.php b/module/VuFindSearch/src/VuFindSearch/Query/Query.php
index f5e3d889631..8718d60f771 100644
--- a/module/VuFindSearch/src/VuFindSearch/Query/Query.php
+++ b/module/VuFindSearch/src/VuFindSearch/Query/Query.php
@@ -93,7 +93,15 @@ class Query extends AbstractQuery
      */
     protected function normalizeText($text)
     {
-        return strtolower($this->stripDiacritics($text));
+        // The input to normalizeText may be a Solr query with Boolean operators
+        // in it; we want to be careful not to turn this into something invalid.
+        $stripped = $this->stripDiacritics($text);
+        $booleans = ['AND', 'OR', 'NOT'];
+        $words = [];
+        foreach (preg_split('/\s+/', $stripped) as $word) {
+            $words[] = in_array($word, $booleans) ? $word : strtolower($word);
+        }
+        return implode(' ', $words);
     }
 
     /**
@@ -223,18 +231,14 @@ class Query extends AbstractQuery
         $queryString = $normalize
             ? $this->getNormalizedString() : $this->queryString;
 
-        // If our "from" pattern contains non-word characters, we can't use word
-        // boundaries for matching.  We want to try to use word boundaries when
-        // possible, however, to avoid the replacement from affecting unexpected
-        // parts of the search query.
-        if (!preg_match('/.*[^\w].*/', $from)) {
-            $pattern = "/\b$from\b/i";
-        } else {
-            $pattern = "/$from/i";
+        // Try to match within word boundaries to prevent the replacement from
+        // affecting unexpected parts of the search query; if that fails to change
+        // anything, try again with a less restricted regular expression. The fall-
+        // back is needed when $from contains punctuation characters such as commas.
+        $this->queryString = preg_replace("/\b$from\b/i", $to, $queryString);
+        if ($queryString === $this->queryString) {
+            $this->queryString = preg_replace("/$from/i", $to, $queryString);
         }
-
-        // Perform the replacement:
-        $this->queryString = preg_replace($pattern, $to, $queryString);
     }
 
     /**
diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php
index fb309de15c3..0b9144bbcd6 100644
--- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php
+++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php
@@ -82,6 +82,41 @@ class QueryTest extends TestCase
         $q = new Query('test query we<(ird and/or');
         $q->replaceTerm('and', 'not');
         $this->assertEquals('test query we<(ird not/or', $q->getString());
+
+        $q = new Query('th\bbbt');
+        $q->replaceTerm('th\bbbt', 'that');
+        $this->assertEquals('that', $q->getString());
+    }
+
+    /**
+     * Test replacing a term containing punctuation; this exercises a special case
+     * in the code.
+     *
+     * @return void
+     */
+    public function testReplacePunctuatedTerm()
+    {
+        $q = new Query('this, that');
+        $q->replaceTerm('this,', 'the other,');
+        $this->assertEquals('the other, that', $q->getString());
+    }
+
+    /**
+     * Test multiple replacements -- this simulates the scenario discussed in the
+     * VUFIND-1423 JIRA ticket.
+     *
+     * @return void
+     */
+    public function testMultipleReplacements()
+    {
+        $q = new Query("color code");
+        $q->replaceTerm('color code', '((color code) OR (color codes))', true);
+        $this->assertEquals('((color code) OR (color codes))', $q->getString());
+        $q->replaceTerm('color code', '((color code) OR (color coded))', true);
+        $this->assertEquals(
+            '((((color code) OR (color coded))) OR (color codes))',
+            $q->getString()
+        );
     }
 
     /**
-- 
GitLab