diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php index 1f7cde1b863a263f3633e7a996581134373b1eab..f7cea5ee5317a8dc8f61d0ae3cb0c8cac7f6aa8d 100644 --- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php +++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php @@ -452,12 +452,16 @@ class SearchHandler */ protected function tokenize($string) { - // Tokenize on spaces and quotes (but ignore escaped quotes) - $phrases = []; - preg_match_all( - '/"(?:\\\\"|.)*?"[~[0-9]+]*|"(?:\\\\"|.)*?"|[^ ]+/', $string, $phrases - ); - $phrases = $phrases[0]; + // First replace escaped quotes with a non-printable character that will + // never be found in user input (ASCII 26, "substitute"). Next use a regex + // to split on whitespace and quoted phrases. Finally, swap the "substitute" + // characters back to escaped quotes. This allows for a simpler regex. + $string = str_replace('\\"', chr(26), $string); + preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases); + $callback = function ($str) { + return str_replace(chr(26), '\\"', $str); + }; + $phrases = array_map($callback, $phrases[0]); $tokens = []; $token = []; diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php index 3b638c00177f99692ea8360e6e9a86292824da0b..28695c0114692ab17687b9c240bba654743e52af 100644 --- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php +++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php @@ -335,4 +335,76 @@ class QueryBuilderTest extends \VuFindTest\Unit\TestCase $processedQ = $response->get('q'); $this->assertEquals('((field_a:(value*)^100 OR field_c:(value*)^200) OR (_query_:"{!dismax qf=\"field_b\" }value2"))', $processedQ[0]); } + + /** + * Test generation 
with multiple quoted phrases. + * + * @return void + */ + public function testMultipleQuotedPhrases() + { + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('"foo" "bar" "baz"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:("foo" OR "bar" OR "baz"))', $processedQ[0]); + } + + /** + * Test generation with a mix of quoted and unquoted phrases; each token is joined with OR in input order and quoted tokens keep their quotes. + * + * @return void + */ + public function testMixedQuotedPhrases() + { + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('708396 "708398" 708399 "708400"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "708400"))', $processedQ[0]); + } + + /** + * Test generation with a mix of quoted and unquoted phrases where the last quoted phrase contains an escaped quote, which must remain inside that phrase. + * + * @return void + */ + public function testMixedQuotedPhrasesWithEscapedQuote() + { + $qb = new QueryBuilder( + [ + 'a' => [ + 'QueryFields' => [ + 'field_a' => [['or', '~']], + ] + ] + ] + ); + + $q = new Query('708396 "708398" 708399 "foo\"bar"', 'a'); + + $response = $qb->build($q); + $processedQ = $response->get('q'); + $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "foo\"bar"))', $processedQ[0]); + } } \ No newline at end of file