From 838e191f6dc2d9e6ae85ecb54c582184af69a029 Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Wed, 16 Jan 2013 14:05:31 -0500
Subject: [PATCH] Resolving VUFIND-737 (Incorrect spelling tokenization). Added
 corresponding unit tests.

---
 .../VuFind/src/VuFind/Search/Base/Results.php | 43 +++++-------
 .../src/Search/Base/ResultsTest.php           | 69 +++++++++++++++++++
 2 files changed, 85 insertions(+), 27 deletions(-)
 create mode 100644 module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php

diff --git a/module/VuFind/src/VuFind/Search/Base/Results.php b/module/VuFind/src/VuFind/Search/Base/Results.php
index ddb217c38ce..f814d04f6c9 100644
--- a/module/VuFind/src/VuFind/Search/Base/Results.php
+++ b/module/VuFind/src/VuFind/Search/Base/Results.php
@@ -440,44 +440,36 @@ abstract class Results implements ServiceLocatorAwareInterface
      */
     public function spellingTokens($input)
     {
+        // Blacklist of useless tokens:
         $joins = array("AND", "OR", "NOT");
-        $paren = array("(" => "", ")" => "");
 
-        // Base of this algorithm comes straight from
-        // PHP doco examples & benighted at gmail dot com
-        // http://php.net/manual/en/function.strtok.php
+        // Strip out parentheses -- irrelevant for tokenization:
+        $paren = array("(" => " ", ")" => " ");
+        $input = trim(strtr($input, $paren));
+
+        // Base of this algorithm comes straight from PHP doc example by
+        // benighted at gmail dot com: http://php.net/manual/en/function.strtok.php
         $tokens = array();
-        $token = strtok($input, ' ');
-        while ($token) {
-            // find bracketed tokens
-            if ($token{0}=='(') {
-                $token .= ' '.strtok(')').')';
-            }
+        $token = strtok($input, " \t");
+        // Compare against false explicitly: strtok() returns false when the
+        // input is exhausted, but a literal "0" is a valid (falsy) token.
+        while ($token !== false) {
             // find double quoted tokens
-            if ($token{0}=='"') {
+            if ($token[0]=='"' && substr($token, -1) != '"') {
                 $token .= ' '.strtok('"').'"';
             }
             // find single quoted tokens
-            if ($token{0}=="'") {
+            if ($token[0]=="'" && substr($token, -1) != "'") {
                 $token .= ' '.strtok("'")."'";
             }
-            $tokens[] = $token;
-            $token = strtok(' ');
-        }
-        // Some cleaning of tokens that are just boolean joins
-        //  and removal of brackets
-        $return = array();
-        foreach ($tokens as $token) {
-            // Ignore join
+            // skip boolean operators
             if (!in_array($token, $joins)) {
-                // And strip parentheses
-                $final = trim(strtr($token, $paren));
-                if ($final != "") {
-                    $return[] = $final;
-                }
+                $tokens[] = $token;
             }
-        }
-        return $return;
+            $token = strtok(" \t");
+        }
+
+        return $tokens;
     }
 
     /**
diff --git a/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php
new file mode 100644
index 00000000000..aa3bdd7d3b8
--- /dev/null
+++ b/module/VuFind/tests/unit-tests/src/Search/Base/ResultsTest.php
@@ -0,0 +1,88 @@
+<?php
+/**
+ * Base Search Object Results Test
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2010.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * @category VuFind2
+ * @package  Tests
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @author   Preetha Rao <vufind-tech@lists.sourceforge.net>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/vufind2:unit_tests Wiki
+ */
+namespace VuFindTest\Search\Base;
+
+/**
+ * Base Search Object Results Test
+ *
+ * @category VuFind2
+ * @package  Tests
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/vufind2:unit_tests Wiki
+ */
+class ResultsTest extends \VuFindTest\Unit\TestCase
+{
+    /**
+     * Test that spelling tokenization works correctly.
+     *
+     * Every query in the table below is passed to spellingTokens() and the
+     * result is compared with the expected list of tokens.
+     *
+     * @return void
+     */
+    public function testSpellingTokenization()
+    {
+        // Map of raw query string => expected spelling tokens.
+        $cases = array(
+            // plain terms; runs of spaces or a tab separate tokens
+            'single' => array('single'),
+            'two terms' => array('two', 'terms'),
+            'two    terms' => array('two', 'terms'),
+            "two\tterms" => array('two', 'terms'),
+            // boolean operators are dropped
+            'apples OR oranges' => array('apples', 'oranges'),
+            // quoted terms keep their quotes
+            '"word"' => array('"word"'),
+            '"word" second' => array('"word"', 'second'),
+            "'word'" => array("'word'"),
+            "'word' second" => array("'word'", 'second'),
+            // parentheses are stripped
+            '(word)' => array('word'),
+            '(word) second' => array('word', 'second'),
+            '(apples OR oranges) AND pears'
+                => array('apples', 'oranges', 'pears'),
+            // nested groups, phrases and operators combined
+            '((("two words" OR single) NOT \'three word phrase\') AND single)'
+                => array(
+                    '"two words"', 'single', "'three word phrase'", 'single'
+                ),
+        );
+
+        // Use Solr results since base results is an abstract class.
+        $solr = $this->getSearchManager()->setSearchClassId('Solr')->getResults();
+
+        foreach ($cases as $input => $expected) {
+            $this->assertEquals(
+                $expected, $solr->spellingTokens($input),
+                'Unexpected tokens for input: ' . $input
+            );
+        }
+    }
+}
-- 
GitLab