From 35f5ef2dc534ce62ac442894fde2e92e4e1697fe Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Mon, 13 Jul 2015 14:34:18 -0400 Subject: [PATCH] Fixed tokenization bug; added tests. --- .../Backend/Solr/SearchHandler.php | 16 +++-- .../Backend/Solr/QueryBuilderTest.php | 72 +++++++++++++++++++ 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php index 1f7cde1b863..f7cea5ee531 100644 --- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php +++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php @@ -452,12 +452,16 @@ class SearchHandler */ protected function tokenize($string) { - // Tokenize on spaces and quotes (but ignore escaped quotes) - $phrases = []; - preg_match_all( - '/"(?:\\\\"|.)*?"[~[0-9]+]*|"(?:\\\\"|.)*?"|[^ ]+/', $string, $phrases - ); - $phrases = $phrases[0]; + // First replace escaped quotes with a non-printable character that will + // never be found in user input (ASCII 26, "substitute"). Next use a regex + // to split on whitespace and quoted phrases. Finally, swap the "substitute" + // characters back to escaped quotes. This allows for a simpler regex. 
+ $string = str_replace('\\"', chr(26), $string); + preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases); + $callback = function ($str) { + return str_replace(chr(26), '\\"', $str); + }; + $phrases = array_map($callback, $phrases[0]); $tokens = []; $token = []; diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php index 3b638c00177..28695c01146 100644 --- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php +++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php @@ -335,4 +335,76 @@ class QueryBuilderTest extends \VuFindTest\Unit\TestCase $processedQ = $response->get('q'); $this->assertEquals('((field_a:(value*)^100 OR field_c:(value*)^200) OR (_query_:"{!dismax qf=\"field_b\" }value2"))', $processedQ[0]); } + + /** + * Test generation with multiple quoted phrases. + * + * @return void + */ + public function testMultipleQuotedPhrases() + { + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('"foo" "bar" "baz"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:("foo" OR "bar" OR "baz"))', $processedQ[0]); + } + + /** + * Test generation with mix of quoted and unquoted phrases. + * + * @return void + */ + public function testMixedQuotedPhrases() + { + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('708396 "708398" 708399 "708400"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "708400"))', $processedQ[0]); + } + + /** + * Test generation with mix of quoted and unquoted phrases, one of which contains an escaped quote. + * + * @return void + */ + public function testMixedQuotedPhrasesWithEscapedQuote() 
{ + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('708396 "708398" 708399 "foo\"bar"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "foo\"bar"))', $processedQ[0]); + } } \ No newline at end of file -- GitLab