diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
index b8df9a3caa90a923d7a23d450d005d43a7d32cdc..a52c610915bfe09fad421a1cec5e459f7afd4bd4 100644
--- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
+++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/LuceneSyntaxHelper.php
@@ -6,6 +6,7 @@
  * PHP version 5
  *
  * Copyright (C) Villanova University 2010.
+ * Copyright (C) The National Library of Finland 2016.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
@@ -25,6 +26,7 @@
  * @author   Andrew S. Nagy <vufind-tech@lists.sourceforge.net>
  * @author   David Maus <maus@hab.de>
  * @author   Demian Katz <demian.katz@villanova.edu>
+ * @author   Ere Maijala <ere.maijala@helsinki.fi>
  * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
  * @link     https://vufind.org
  */
@@ -38,6 +40,7 @@ namespace VuFindSearch\Backend\Solr;
  * @author   Andrew S. Nagy <vufind-tech@lists.sourceforge.net>
  * @author   David Maus <maus@hab.de>
  * @author   Demian Katz <demian.katz@villanova.edu>
+ * @author   Ere Maijala <ere.maijala@helsinki.fi>
  * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
  * @link     https://vufind.org
  */
@@ -267,6 +270,93 @@ class LuceneSyntaxHelper
         return trim(preg_replace_callback($regs, $callback, $string));
     }
 
+    /**
+     * Extract search terms from a query string for spell checking.
+     *
+     * This will only handle the most often used simple cases. Local
+     * parameters, fuzziness/proximity/boost suffixes, field names and leading
+     * +/- operators are discarded; quoted phrases and range brackets are kept.
+     * While splitting into terms, a character preceded by a backslash is always
+     * treated as literal text.
+     *
+     * @param string $query Query string
+     *
+     * @return string Extracted terms separated by single spaces
+     */
+    public function extractSearchTerms($query)
+    {
+        $result = [];
+        $inQuotes = false;
+        $escaped = false;
+        $collected = '';
+        $discardParens = 0;
+        // Discard local parameters
+        $query = preg_replace('/\{!.+?\}/', '', $query);
+        // Discard fuzziness and proximity indicators
+        $query = preg_replace('/\~[^\s]*/', '', $query);
+        // Discard boost factors
+        $query = preg_replace('/\^[^\s]*/', '', $query);
+        foreach (str_split($query) as $ch) {
+            // The character after a backslash is escaped: keep both as they
+            // are and never treat the escaped character as query syntax
+            if ($escaped) {
+                $collected .= $ch;
+                $escaped = false;
+                continue;
+            }
+            if ($ch === '\\') {
+                $collected .= $ch;
+                $escaped = true;
+                continue;
+            }
+            // Handle quotes (everything in quotes is considered part of search
+            // terms)
+            if ($ch === '"') {
+                $inQuotes = !$inQuotes;
+            }
+            if (!$inQuotes) {
+                // Discard closing parenthesis for previously discarded opening
+                // ones to keep balance
+                if ($ch === ')' && $discardParens > 0) {
+                    --$discardParens;
+                    continue;
+                }
+                // Flush to result array on word break; spaces are never
+                // collected, so repeated or leading ones cannot start a term
+                if ($ch === ' ') {
+                    if ($collected !== '') {
+                        $result[] = $collected;
+                        $collected = '';
+                    }
+                    continue;
+                }
+                // If we encounter ':', discard preceding string as it's a field
+                // name
+                if ($ch === ':') {
+                    // Take into account any opening parenthesis we discard here
+                    $discardParens += substr_count($collected, '(');
+                    $collected = '';
+                    continue;
+                }
+            }
+            $collected .= $ch;
+        }
+        // Flush final collected string
+        if ($collected !== '') {
+            $result[] = $collected;
+        }
+        // Discard any preceding pluses or minuses and drop terms that held
+        // nothing else (e.g. a lone "-")
+        $terms = [];
+        foreach ($result as $term) {
+            $term = ltrim($term, '+-');
+            if ($term !== '') {
+                $terms[] = $term;
+            }
+        }
+        return implode(' ', $terms);
+    }
+
     /**
      * Are there any case-sensitive Boolean operators configured?
      *
diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
index 29f228bad3b4fdc659b564714abf7cc34f35d7a9..1e1bace701d52b410b33e0f81a148e4b4c2588e1 100644
--- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
+++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/QueryBuilder.php
@@ -120,10 +120,13 @@ class QueryBuilder implements QueryBuilderInterface
     {
         $params = new ParamBag();
 
-        // Add spelling query if applicable -- note that we mus set this up before
+        // Add spelling query if applicable -- note that we must set this up before
         // we process the main query in order to avoid unwanted extra syntax:
         if ($this->createSpellingQuery) {
-            $params->set('spellcheck.q', $query->getAllTerms());
+            $params->set(
+                'spellcheck.q',
+                $this->getLuceneHelper()->extractSearchTerms($query->getAllTerms())
+            );
         }
 
         if ($query instanceof QueryGroup) {
diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
index 23fab9e15b97a1bde3acb531a9463d88c9938f41..f25106da0068a13c996dea7e29763da2f1d7017d 100644
--- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
+++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/LuceneSyntaxHelperTest.php
@@ -342,4 +342,46 @@ class LuceneSyntaxHelperTest extends \VuFindTest\Unit\TestCase
             $expected, $lh->normalizeSearchString($input)
         );
     }
+
+    /**
+     * Test search term extraction
+     *
+     * @return void
+     */
+    public function testExtractSearchTerms()
+    {
+        $lh = new LuceneSyntaxHelper(false, false);
+        $tests = [
+            'keyword' => 'keyword',
+            'two keywords' => 'two keywords',
+            'index:keyword' => 'keyword',
+            'index:keyword anotherkeyword' => 'keyword anotherkeyword',
+            'index:keyword anotherindex:anotherkeyword' => 'keyword anotherkeyword',
+            '(index:keyword)' => 'keyword',
+            'index:(keyword1 keyword2)' => '(keyword1 keyword2)',
+            '{!local params}keyword' => 'keyword',
+            'keyword~' => 'keyword',
+            'keyword~0.8' => 'keyword',
+            'keyword keyword2^20' => 'keyword keyword2',
+            '"keyword keyword2 keyword3"~2' => '"keyword keyword2 keyword3"',
+            '"kw1 kw2 kw3"~2 kw4^200' => '"kw1 kw2 kw3" kw4',
+            '+keyword -keyword2^20' => 'keyword keyword2',
+            'index:+keyword index2:-keyword2^20' => 'keyword keyword2',
+            'index:[start TO end]' => '[start TO end]',
+            'index:{start TO end}' => '{start TO end}',
+            'es\\"caped field:test' => 'es\\"caped test',
+            // Extra whitespace and bare operators must not leak into terms
+            'keyword  +keyword2' => 'keyword keyword2',
+            ' keyword ' => 'keyword',
+            'keyword - keyword2' => 'keyword keyword2',
+            // Backslash-escaped characters are literal, not query syntax
+            'escaped\\:colon field:test' => 'escaped\\:colon test',
+            '\\\\"index:keyword"' => '\\\\"index:keyword"',
+        ];
+        foreach ($tests as $input => $expected) {
+            $this->assertEquals(
+                $expected, $lh->extractSearchTerms($input)
+            );
+        }
+    }
 }