From 838e191f6dc2d9e6ae85ecb54c582184af69a029 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Wed, 16 Jan 2013 14:05:31 -0500 Subject: [PATCH] Resolving VUFIND-737 (Incorrect spelling tokenization). Added corresponding unit tests. --- .../VuFind/src/VuFind/Search/Base/Results.php | 43 +++++------- .../src/Search/Base/ResultsTest.php | 69 +++++++++++++++++++ 2 files changed, 85 insertions(+), 27 deletions(-) create mode 100644 module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php diff --git a/module/VuFind/src/VuFind/Search/Base/Results.php b/module/VuFind/src/VuFind/Search/Base/Results.php index ddb217c38ce..f814d04f6c9 100644 --- a/module/VuFind/src/VuFind/Search/Base/Results.php +++ b/module/VuFind/src/VuFind/Search/Base/Results.php @@ -440,44 +440,33 @@ abstract class Results implements ServiceLocatorAwareInterface */ public function spellingTokens($input) { + // Blacklist of useless tokens: $joins = array("AND", "OR", "NOT"); - $paren = array("(" => "", ")" => ""); - // Base of this algorithm comes straight from - // PHP doco examples & benighted at gmail dot com - // http://php.net/manual/en/function.strtok.php + // Strip out parentheses -- irrelevant for tokenization: + $paren = array("(" => " ", ")" => " "); + $input = trim(strtr($input, $paren)); + + // Base of this algorithm comes straight from PHP doc example by + // benighted at gmail dot com: http://php.net/manual/en/function.strtok.php $tokens = array(); - $token = strtok($input, ' '); - while ($token) { - // find bracketed tokens - if ($token{0}=='(') { - $token .= ' '.strtok(')').')'; - } + $token = strtok($input, " \t"); + do { // find double quoted tokens - if ($token{0}=='"') { + if ($token{0}=='"' && substr($token, -1) != '"') { $token .= ' '.strtok('"').'"'; } // find single quoted tokens - if ($token{0}=="'") { + if ($token{0}=="'" && substr($token, -1) != "'") { $token .= ' '.strtok("'")."'"; } - $tokens[] = $token; - $token = strtok(' '); - } - // Some cleaning of tokens that are just boolean joins - // and removal of brackets - $return = array(); - foreach ($tokens as $token) { - // Ignore join + // skip boolean operators if (!in_array($token, $joins)) { - // And strip parentheses - $final = trim(strtr($token, $paren)); - if ($final != "") { - $return[] = $final; - } + $tokens[] = $token; } - } - return $return; + } while ($token = strtok(" \t")); + + return $tokens; } /** diff --git a/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php new file mode 100644 index 00000000000..aa3bdd7d3b8 --- /dev/null +++ b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php @@ -0,0 +1,69 @@ +<?php +/** + * Base Search Object Results Test + * + * PHP version 5 + * + * Copyright (C) Villanova University 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Tests + * @author Demian Katz <demian.katz@villanova.edu> + * @author Preetha Rao <vufind-tech@lists.sourceforge.net> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/vufind2:unit_tests Wiki + */ +namespace VuFindTest\Search\Base; + +/** + * Base Search Object Results Test + * + * @category VuFind2 + * @package Tests + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/vufind2:unit_tests Wiki + */ +class ResultsTest extends \VuFindTest\Unit\TestCase +{ + /** + * Test that spelling tokenization works correctly. + * + * @return void + */ + public function testSpellingTokenization() + { + // Use Solr results since base results is an abstract class. + $solr = $this->getSearchManager()->setSearchClassId('Solr')->getResults(); + + $this->assertEquals(array('single'), $solr->spellingTokens('single')); + $this->assertEquals(array('two', 'terms'), $solr->spellingTokens('two terms')); + $this->assertEquals(array('two', 'terms'), $solr->spellingTokens('two terms')); + $this->assertEquals(array('apples', 'oranges'), $solr->spellingTokens('apples OR oranges')); + $this->assertEquals(array('"word"'), $solr->spellingTokens('"word"')); + $this->assertEquals(array('"word"', 'second'), $solr->spellingTokens('"word" second')); + $this->assertEquals(array("'word'"), $solr->spellingTokens("'word'")); + $this->assertEquals(array("'word'", 'second'), $solr->spellingTokens("'word' second")); + $this->assertEquals(array('word'), $solr->spellingTokens('(word)')); + $this->assertEquals(array('word', 'second'), $solr->spellingTokens('(word) second')); + $this->assertEquals(array('apples', 'oranges', 'pears'), $solr->spellingTokens('(apples OR oranges) AND pears')); + $this->assertEquals(array('two', 'terms'), $solr->spellingTokens("two\tterms")); + $this->assertEquals( + array('"two words"', 'single', "'three word phrase'", 'single'), + $solr->spellingTokens('((("two words" OR single) NOT \'three word phrase\') AND single)') + ); + } +} \ No newline at end of file -- GitLab