From 35f5ef2dc534ce62ac442894fde2e92e4e1697fe Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Mon, 13 Jul 2015 14:34:18 -0400
Subject: [PATCH] Fixed tokenization bug; added tests.

---
 .../Backend/Solr/SearchHandler.php            | 16 +++--
 .../Backend/Solr/QueryBuilderTest.php         | 72 +++++++++++++++++++
 2 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php
index 1f7cde1b863..f7cea5ee531 100644
--- a/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php
+++ b/module/VuFindSearch/src/VuFindSearch/Backend/Solr/SearchHandler.php
@@ -452,12 +452,16 @@ class SearchHandler
      */
     protected function tokenize($string)
     {
-        // Tokenize on spaces and quotes (but ignore escaped quotes)
-        $phrases = [];
-        preg_match_all(
-            '/"(?:\\\\"|.)*?"[~[0-9]+]*|"(?:\\\\"|.)*?"|[^ ]+/', $string, $phrases
-        );
-        $phrases = $phrases[0];
+        // First replace escaped quotes with a non-printable character that will
+        // never be found in user input (ASCII 26, "substitute"). Next use a regex
+        // to split on whitespace and quoted phrases. Finally, swap the "substitute"
+        // characters back to escaped quotes. This allows for a simpler regex.
+        $string = str_replace('\\"', chr(26), $string);
+        preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases);
+        $callback = function ($str) {
+            return str_replace(chr(26), '\\"', $str);
+        };
+        $phrases = array_map($callback, $phrases[0]);
 
         $tokens  = [];
         $token   = [];
diff --git a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php
index 3b638c00177..28695c01146 100644
--- a/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php
+++ b/module/VuFindSearch/tests/unit-tests/src/VuFindTest/Backend/Solr/QueryBuilderTest.php
@@ -335,4 +335,76 @@ class QueryBuilderTest extends \VuFindTest\Unit\TestCase
         $processedQ = $response->get('q');
         $this->assertEquals('((field_a:(value*)^100 OR field_c:(value*)^200) OR (_query_:"{!dismax qf=\"field_b\" }value2"))', $processedQ[0]);
     }
+
+    /**
+     * Test that multiple quoted phrases are each kept as a separate OR term.
+     *
+     * @return void
+     */
+    public function testMultipleQuotedPhrases()
+    {
+        $qb = new QueryBuilder(
+            [
+                'a' => [
+                    'QueryFields' => [
+                        'field_a' => [['or', '~']],
+                    ]
+                ]
+            ]
+        );
+
+        $q = new Query('"foo" "bar" "baz"', 'a');
+
+        $response = $qb->build($q);
+        $processedQ = $response->get('q');
+        $this->assertEquals('(field_a:("foo" OR "bar" OR "baz"))', $processedQ[0]);
+    }
+
+    /**
+     * Test that quoted and unquoted terms can be mixed; quotes are preserved.
+     *
+     * @return void
+     */
+    public function testMixedQuotedPhrases()
+    {
+        $qb = new QueryBuilder(
+            [
+                'a' => [
+                    'QueryFields' => [
+                        'field_a' => [['or', '~']],
+                    ]
+                ]
+            ]
+        );
+
+        $q = new Query('708396 "708398" 708399 "708400"', 'a');
+
+        $response = $qb->build($q);
+        $processedQ = $response->get('q');
+        $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "708400"))', $processedQ[0]);
+    }
+
+    /**
+     * Test that an escaped quote inside a quoted phrase does not split it.
+     *
+     * @return void
+     */
+    public function testMixedQuotedPhrasesWithEscapedQuote()
+    {
+        $qb = new QueryBuilder(
+            [
+                'a' => [
+                    'QueryFields' => [
+                        'field_a' => [['or', '~']],
+                    ]
+                ]
+            ]
+        );
+
+        $q = new Query('708396 "708398" 708399 "foo\"bar"', 'a');
+
+        $response = $qb->build($q);
+        $processedQ = $response->get('q');
+        $this->assertEquals('(field_a:(708396 OR "708398" OR 708399 OR "foo\"bar"))', $processedQ[0]);
+    }
 }
\ No newline at end of file
-- 
GitLab