Skip to content
Snippets Groups Projects
Commit 35f5ef2d authored by Demian Katz
Browse files

Fixed tokenization bug; added tests.

parent eb7fe014
No related merge requests found
......@@ -452,12 +452,16 @@ class SearchHandler
*/
protected function tokenize($string)
{
// Tokenize on spaces and quotes (but ignore escaped quotes)
$phrases = [];
preg_match_all(
'/"(?:\\\\"|.)*?"[~[0-9]+]*|"(?:\\\\"|.)*?"|[^ ]+/', $string, $phrases
);
$phrases = $phrases[0];
// First replace escaped quotes with a non-printable character that will
// never be found in user input (ASCII 26, "substitute"). Next use a regex
// to split on whitespace and quoted phrases. Finally, swap the "substitute"
// characters back to escaped quotes. This allows for a simpler regex.
$string = str_replace('\\"', chr(26), $string);
preg_match_all('/[^\s"]+|"([^"]*)"/', $string, $phrases);
$callback = function ($str) {
return str_replace(chr(26), '\\"', $str);
};
$phrases = array_map($callback, $phrases[0]);
$tokens = [];
$token = [];
......
......@@ -335,4 +335,76 @@ class QueryBuilderTest extends \VuFindTest\Unit\TestCase
$processedQ = $response->get('q');
$this->assertEquals('((field_a:(value*)^100 OR field_c:(value*)^200) OR (_query_:"{!dismax qf=\"field_b\" }value2"))', $processedQ[0]);
}
/**
 * Test that a query made up solely of quoted phrases keeps every phrase
 * intact: '"foo" "bar" "baz"' must come out as three quoted phrases
 * joined with OR inside the field_a clause.
 *
 * @return void
 */
public function testMultipleQuotedPhrases()
{
    // Minimal search spec: one handler ('a') searching a single field.
    $specs = [
        'a' => [
            'QueryFields' => [
                'field_a' => [['or', '~']],
            ]
        ]
    ];
    $builder = new QueryBuilder($specs);
    $query = new Query('"foo" "bar" "baz"', 'a');

    $params = $builder->build($query);
    $generated = $params->get('q');

    $expected = '(field_a:("foo" OR "bar" OR "baz"))';
    $this->assertEquals($expected, $generated[0]);
}
/**
 * Test that quoted and unquoted terms can be freely interleaved: bare
 * terms stay bare, quoted terms keep their quotes, and all of them are
 * joined with OR in their original order.
 *
 * @return void
 */
public function testMixedQuotedPhrases()
{
    // Minimal search spec: one handler ('a') searching a single field.
    $specs = [
        'a' => [
            'QueryFields' => [
                'field_a' => [['or', '~']],
            ]
        ]
    ];
    $builder = new QueryBuilder($specs);
    $query = new Query('708396 "708398" 708399 "708400"', 'a');

    $params = $builder->build($query);
    $generated = $params->get('q');

    $expected = '(field_a:(708396 OR "708398" OR 708399 OR "708400"))';
    $this->assertEquals($expected, $generated[0]);
}
/**
 * Test a mix of quoted and unquoted terms where one quoted phrase
 * contains an escaped quote ("foo\"bar"). The escaped quote must not be
 * treated as the end of the phrase, and the backslash escape must still
 * be present in the generated query.
 *
 * @return void
 */
public function testMixedQuotedPhrasesWithEscapedQuote()
{
    // Minimal search spec: one handler ('a') searching a single field.
    $specs = [
        'a' => [
            'QueryFields' => [
                'field_a' => [['or', '~']],
            ]
        ]
    ];
    $builder = new QueryBuilder($specs);
    // Single-quoted PHP string: \" is a literal backslash followed by a quote.
    $query = new Query('708396 "708398" 708399 "foo\"bar"', 'a');

    $params = $builder->build($query);
    $generated = $params->get('q');

    $expected = '(field_a:(708396 OR "708398" OR 708399 OR "foo\"bar"))';
    $this->assertEquals($expected, $generated[0]);
}
}
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment