Skip to content
Snippets Groups Projects
Commit 838e191f authored by Demian Katz's avatar Demian Katz
Browse files

Resolving VUFIND-737 (Incorrect spelling tokenization).

Added corresponding unit tests.
parent bb8eb4af
No related merge requests found
......@@ -440,44 +440,33 @@ abstract class Results implements ServiceLocatorAwareInterface
*/
public function spellingTokens($input)
{
// Blacklist of useless tokens:
$joins = array("AND", "OR", "NOT");
$paren = array("(" => "", ")" => "");
// Base of this algorithm comes straight from
// PHP doco examples & benighted at gmail dot com
// http://php.net/manual/en/function.strtok.php
// Strip out parentheses -- irrelevant for tokenization:
$paren = array("(" => " ", ")" => " ");
$input = trim(strtr($input, $paren));
// Base of this algorithm comes straight from PHP doc example by
// benighted at gmail dot com: http://php.net/manual/en/function.strtok.php
$tokens = array();
$token = strtok($input, ' ');
while ($token) {
// find bracketed tokens
if ($token{0}=='(') {
$token .= ' '.strtok(')').')';
}
$token = strtok($input, " \t");
do {
// find double quoted tokens
if ($token{0}=='"') {
if ($token{0}=='"' && substr($token, -1) != '"') {
$token .= ' '.strtok('"').'"';
}
// find single quoted tokens
if ($token{0}=="'") {
if ($token{0}=="'" && substr($token, -1) != "'") {
$token .= ' '.strtok("'")."'";
}
$tokens[] = $token;
$token = strtok(' ');
}
// Some cleaning of tokens that are just boolean joins
// and removal of brackets
$return = array();
foreach ($tokens as $token) {
// Ignore join
// skip boolean operators
if (!in_array($token, $joins)) {
// And strip parentheses
$final = trim(strtr($token, $paren));
if ($final != "") {
$return[] = $final;
}
$tokens[] = $token;
}
}
return $return;
} while ($token = strtok(" \t"));
return $tokens;
}
/**
......
<?php
/**
* Base Search Object Results Test
*
* PHP version 5
*
* Copyright (C) Villanova University 2010.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* @category VuFind2
* @package Tests
* @author Demian Katz <demian.katz@villanova.edu>
* @author Preetha Rao <vufind-tech@lists.sourceforge.net>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link http://vufind.org/wiki/vufind2:unit_tests Wiki
*/
namespace VuFindTest\Search\Base;
/**
* Base Search Object Results Test
*
* @category VuFind2
* @package Tests
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link http://vufind.org/wiki/vufind2:unit_tests Wiki
*/
class ResultsTest extends \VuFindTest\Unit\TestCase
{
    /**
     * Test that spelling tokenization works correctly.
     *
     * Exercises plain terms, whitespace separators (single space, repeated
     * spaces, tab), removal of the boolean operators AND/OR/NOT, quoted
     * phrases (expected back as one token with their quotes intact) and
     * stripping of parentheses.
     *
     * @return void
     */
    public function testSpellingTokenization()
    {
        // Use Solr results since base results is an abstract class.
        $solr = $this->getSearchManager()->setSearchClassId('Solr')->getResults();

        // Each case is array(input query, expected token list).
        $cases = array(
            // Plain terms and whitespace handling:
            array('single', array('single')),
            array('two terms', array('two', 'terms')),
            // Two consecutive spaces must not produce an empty token:
            array('two  terms', array('two', 'terms')),
            array("two\tterms", array('two', 'terms')),
            // Boolean operators are dropped:
            array('apples OR oranges', array('apples', 'oranges')),
            // Quoted single words keep their quotes:
            array('"word"', array('"word"')),
            array('"word" second', array('"word"', 'second')),
            array("'word'", array("'word'")),
            array("'word' second", array("'word'", 'second')),
            // Parentheses are stripped:
            array('(word)', array('word')),
            array('(word) second', array('word', 'second')),
            array(
                '(apples OR oranges) AND pears',
                array('apples', 'oranges', 'pears')
            ),
            // Nested grouping combined with quoted phrases and operators:
            array(
                '((("two words" OR single) NOT \'three word phrase\') AND single)',
                array('"two words"', 'single', "'three word phrase'", 'single')
            ),
        );

        foreach ($cases as $case) {
            list($input, $expected) = $case;
            $this->assertEquals(
                $expected,
                $solr->spellingTokens($input),
                'Unexpected spelling tokens for input: ' . $input
            );
        }
    }
}
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment