From 436987065b381df514d9f4f5381c059115fa7866 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Thu, 10 Sep 2020 12:40:47 -0400 Subject: [PATCH] Fix replaceTerm() word boundary / normalization bugs. (#1712) - Resolves VUFIND-1423. - Includes some additional test cases. --- .../src/VuFindSearch/Query/Query.php | 28 ++++++++------- .../src/VuFindTest/Query/QueryTest.php | 35 +++++++++++++++++++ 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/module/VuFindSearch/src/VuFindSearch/Query/Query.php b/module/VuFindSearch/src/VuFindSearch/Query/Query.php index f5e3d889631..8718d60f771 100644 --- a/module/VuFindSearch/src/VuFindSearch/Query/Query.php +++ b/module/VuFindSearch/src/VuFindSearch/Query/Query.php @@ -93,7 +93,15 @@ class Query extends AbstractQuery */ protected function normalizeText($text) { - return strtolower($this->stripDiacritics($text)); + // The input to normalizeText may be a Solr query with Boolean operators + // in it; we want to be careful not to turn this into something invalid. + $stripped = $this->stripDiacritics($text); + $booleans = ['AND', 'OR', 'NOT']; + $words = []; + foreach (preg_split('/\s+/', $stripped) as $word) { + $words[] = in_array($word, $booleans) ? $word : strtolower($word); + } + return implode(' ', $words); } /** @@ -223,18 +231,14 @@ class Query extends AbstractQuery $queryString = $normalize ? $this->getNormalizedString() : $this->queryString; - // If our "from" pattern contains non-word characters, we can't use word - // boundaries for matching. We want to try to use word boundaries when - // possible, however, to avoid the replacement from affecting unexpected - // parts of the search query. - if (!preg_match('/.*[^\w].*/', $from)) { - $pattern = "/\b$from\b/i"; - } else { - $pattern = "/$from/i"; + // Try to match within word boundaries to prevent the replacement from + // affecting unexpected parts of the search query; if that fails to change + // anything, try again with a less restricted regular expression. The fall- + // back is needed when $from contains punctuation characters such as commas. + $this->queryString = preg_replace("/\b$from\b/i", $to, $queryString); + if ($queryString === $this->queryString) { + $this->queryString = preg_replace("/$from/i", $to, $queryString); } - - // Perform the replacement: - $this->queryString = preg_replace($pattern, $to, $queryString); } /** diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php index fb309de15c3..0b9144bbcd6 100644 --- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php +++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Query/QueryTest.php @@ -82,6 +82,41 @@ class QueryTest extends TestCase $q = new Query('test query we<(ird and/or'); $q->replaceTerm('and', 'not'); $this->assertEquals('test query we<(ird not/or', $q->getString()); + + $q = new Query('th\bbbt'); + $q->replaceTerm('th\bbbt', 'that'); + $this->assertEquals('that', $q->getString()); + } + + /** + * Test replacing a term containing punctuation; this exercises a special case + * in the code. + * + * @return void + */ + public function testReplacePunctuatedTerm() + { + $q = new Query('this, that'); + $q->replaceTerm('this,', 'the other,'); + $this->assertEquals('the other, that', $q->getString()); + } + + /** + * Test multiple replacements -- this simulates the scenario discussed in the + * VUFIND-1423 JIRA ticket. + * + * @return void + */ + public function testMultipleReplacements() + { + $q = new Query("color code"); + $q->replaceTerm('color code', '((color code) OR (color codes))', true); + $this->assertEquals('((color code) OR (color codes))', $q->getString()); + $q->replaceTerm('color code', '((color code) OR (color coded))', true); + $this->assertEquals( + '((((color code) OR (color coded))) OR (color codes))', + $q->getString() + ); } /** -- GitLab