From ffb477250fd207e25e81a8325feefae4affc2cd0 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Thu, 17 Jan 2013 07:22:39 -0500 Subject: [PATCH] More spelling tokenization fixes related to VUFIND-737. --- .../VuFind/src/VuFind/Search/Base/Results.php | 18 +++++++++++------- .../unit-tests/src/Search/Base/ResultsTest.php | 12 ++++++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/module/VuFind/src/VuFind/Search/Base/Results.php b/module/VuFind/src/VuFind/Search/Base/Results.php index f814d04f6c9..f5d4075d12f 100644 --- a/module/VuFind/src/VuFind/Search/Base/Results.php +++ b/module/VuFind/src/VuFind/Search/Base/Results.php @@ -451,21 +451,25 @@ abstract class Results implements ServiceLocatorAwareInterface // benighted at gmail dot com: http://php.net/manual/en/function.strtok.php $tokens = array(); $token = strtok($input, " \t"); - do { + while ($token !== false) { // find double quoted tokens - if ($token{0}=='"' && substr($token, -1) != '"') { + if (substr($token, 0, 1) == '"' && substr($token, -1) != '"') { $token .= ' '.strtok('"').'"'; } - // find single quoted tokens - if ($token{0}=="'" && substr($token, -1) != "'") { - $token .= ' '.strtok("'")."'"; - } // skip boolean operators if (!in_array($token, $joins)) { $tokens[] = $token; } - } while ($token = strtok(" \t")); + $token = strtok(" \t"); + } + // If the last token ends in a double quote but the input string does not, + // the tokenization process added the quote, which will break spelling + // replacements. We need to strip it back off again: + $last = count($tokens) > 0 ? $tokens[count($tokens) - 1] : null; + if ($last && substr($last, -1) == '"' && substr($input, -1) != '"') { + $tokens[count($tokens) - 1] = substr($last, 0, strlen($last) - 1); + } return $tokens; } diff --git a/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php index aa3bdd7d3b8..89be58ac202 100644 --- a/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php +++ b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php @@ -55,15 +55,19 @@ class ResultsTest extends \VuFindTest\Unit\TestCase $this->assertEquals(array('apples', 'oranges'), $solr->spellingTokens('apples OR oranges')); $this->assertEquals(array('"word"'), $solr->spellingTokens('"word"')); $this->assertEquals(array('"word"', 'second'), $solr->spellingTokens('"word" second')); - $this->assertEquals(array("'word'"), $solr->spellingTokens("'word'")); - $this->assertEquals(array("'word'", 'second'), $solr->spellingTokens("'word' second")); + $this->assertEquals(array(), $solr->spellingTokens('')); + $this->assertEquals(array('0', 'is', 'zero'), $solr->spellingTokens('0 is zero')); + $this->assertEquals(array("'twas", 'successful'), $solr->spellingTokens("'twas successful")); $this->assertEquals(array('word'), $solr->spellingTokens('(word)')); $this->assertEquals(array('word', 'second'), $solr->spellingTokens('(word) second')); $this->assertEquals(array('apples', 'oranges', 'pears'), $solr->spellingTokens('(apples OR oranges) AND pears')); $this->assertEquals(array('two', 'terms'), $solr->spellingTokens("two\tterms")); $this->assertEquals( - array('"two words"', 'single', "'three word phrase'", 'single'), - $solr->spellingTokens('((("two words" OR single) NOT \'three word phrase\') AND single)') + array('"two words"', 'single', '"three word phrase"', 'single'), + $solr->spellingTokens('((("two words" OR single) NOT "three word phrase") AND single)') ); + $this->assertEquals(array('"unfinished phrase'), $solr->spellingTokens('"unfinished phrase')); + $this->assertEquals(array('"'), $solr->spellingTokens('"')); + $this->assertEquals(array('""'), $solr->spellingTokens('""')); } } \ No newline at end of file -- GitLab