From f8ff42dfa76c61e522ca8eca958b7bf73c52f96e Mon Sep 17 00:00:00 2001
From: Ere Maijala <ere.maijala@helsinki.fi>
Date: Wed, 6 Jul 2016 11:21:29 -0400
Subject: [PATCH] Added method to strip Lucene syntax for spelling queries -
 Resolves VUFIND-1184.

---
 .../Backend/Solr/LuceneSyntaxHelper.php       | 68 +++++++++++++++++++
 .../Backend/Solr/QueryBuilder.php             |  7 +-
 .../Backend/Solr/LuceneSyntaxHelperTest.php   | 34 ++++++++++
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
index b8df9a3caa9..a52c610915b 100644
--- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
+++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
@@ -6,6 +6,7 @@
  * PHP version 5
  *
  * Copyright (C) Villanova University 2010.
+ * Copyright (C) The National Library of Finland 2016.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
@@ -25,6 +26,7 @@
  * @author   Andrew S. Nagy <vufind-tech@lists.sourceforge.net>
  * @author   David Maus <maus@hab.de>
  * @author   Demian Katz <demian.katz@villanova.edu>
+ * @author   Ere Maijala <ere.maijala@helsinki.fi>
  * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
  * @link     https://vufind.org
  */
@@ -38,6 +40,7 @@ namespace VuFindSearch\Backend\Solr;
  * @author   Andrew S. Nagy <vufind-tech@lists.sourceforge.net>
  * @author   David Maus <maus@hab.de>
  * @author   Demian Katz <demian.katz@villanova.edu>
+ * @author   Ere Maijala <ere.maijala@helsinki.fi>
  * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
  * @link     https://vufind.org
  */
@@ -267,6 +270,71 @@ class LuceneSyntaxHelper
         return trim(preg_replace_callback($regs, $callback, $string));
     }
 
+    /**
+     * Extract search terms from a query string for spell checking.
+     *
+     * This will only handle the most often used simple cases.
+     *
+     * @param string $query Query string
+     *
+     * @return string Extracted terms separated by single spaces
+     */
+    public function extractSearchTerms($query)
+    {
+        $result = [];
+        $inQuotes = false;
+        $collected = '';
+        $discardParens = 0;
+        // Discard local parameters
+        $query = preg_replace('/\{!.+?\}/', '', $query);
+        // Discard fuzziness and proximity indicators
+        $query = preg_replace('/\~[^\s]*/', '', $query);
+        $query = preg_replace('/\^[^\s]*/', '', $query);
+        $lastCh = '';
+        foreach (str_split($query) as $ch) {
+            $escaped = $lastCh === '\\';
+            $lastCh = $ch;
+            // Handle quotes (everything in quotes is part of the search terms);
+            // escaping is judged by the real previous character, kept or not
+            if ($ch === '"' && !$escaped) {
+                $inQuotes = !$inQuotes;
+            }
+            if (!$inQuotes) {
+                // Discard ')' balancing an opening parenthesis discarded earlier
+                if ($ch === ')' && $discardParens > 0) {
+                    --$discardParens;
+                    continue;
+                }
+                // Flush on word break; never collect the separator itself
+                if ($ch === ' ') {
+                    if ($collected !== '') {
+                        $result[] = $collected;
+                        $collected = '';
+                    }
+                    continue;
+                }
+                // If we encounter ':', discard preceding string as it's a field name
+                if ($ch === ':') {
+                    // Take into account any opening parenthesis we discard here
+                    $discardParens += substr_count($collected, '(');
+                    $collected = '';
+                    continue;
+                }
+            }
+            $collected .= $ch;
+        }
+        // Flush final collected string
+        if ($collected !== '') {
+            $result[] = $collected;
+        }
+        // Discard any preceding pluses or minuses, dropping terms left empty
+        $strip = function ($s) {
+            return ltrim($s, '+-');
+        };
+        $terms = array_map($strip, $result);
+        return implode(' ', array_filter($terms, 'strlen'));
+    }
+
     /**
      * Are there any case-sensitive Boolean operators configured?
      *
diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
index 29f228bad3b..1e1bace701d 100644
--- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
+++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
@@ -120,10 +120,13 @@ class QueryBuilder implements QueryBuilderInterface
     {
         $params = new ParamBag();
 
-        // Add spelling query if applicable -- note that we mus set this up before
+        // Add spelling query if applicable -- note that we must set this up before
         // we process the main query in order to avoid unwanted extra syntax:
         if ($this->createSpellingQuery) {
-            $params->set('spellcheck.q', $query->getAllTerms());
+            $params->set(
+                'spellcheck.q',
+                $this->getLuceneHelper()->extractSearchTerms($query->getAllTerms())
+            );
         }
 
         if ($query instanceof QueryGroup) {
diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
index 23fab9e15b9..f25106da006 100644
--- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
+++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
@@ -342,4 +342,38 @@ class LuceneSyntaxHelperTest extends \VuFindTest\Unit\TestCase
             $expected, $lh->normalizeSearchString($input)
         );
     }
+
+    /**
+     * Test search term extraction against a table of input => expected output
+     *
+     * @return void
+     */
+    public function testExtractSearchTerms()
+    {
+        $lh = new LuceneSyntaxHelper(false, false);
+        $tests = [
+            'keyword' => 'keyword',
+            'two keywords' => 'two keywords',
+            'index:keyword' => 'keyword',
+            'index:keyword anotherkeyword' => 'keyword anotherkeyword',
+            'index:keyword anotherindex:anotherkeyword' => 'keyword anotherkeyword',
+            '(index:keyword)' => 'keyword',
+            'index:(keyword1 keyword2)' => '(keyword1 keyword2)',
+            '{!local params}keyword' => 'keyword',
+            'keyword~' => 'keyword',
+            'keyword~0.8' => 'keyword',
+            'keyword keyword2^20' => 'keyword keyword2',
+            '"keyword keyword2 keyword3"~2' => '"keyword keyword2 keyword3"',
+            '"kw1 kw2 kw3"~2 kw4^200' => '"kw1 kw2 kw3" kw4',
+            '+keyword -keyword2^20' => 'keyword keyword2',
+            'index:+keyword index2:-keyword2^20' => 'keyword keyword2',
+            'index:[start TO end]' => '[start TO end]',
+            'index:{start TO end}' => '{start TO end}',
+            'es\\"caped field:test' => 'es\\"caped test'
+        ];
+        foreach ($tests as $input => $expected) {
+            $actual = $lh->extractSearchTerms($input);
+            $this->assertEquals($expected, $actual, "Input: $input");
+        }
+    }
 }
-- 
GitLab