From 7b84dd36e07a09a6dc96e87cbe377a57d3d06864 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Wed, 26 Jun 2013 14:35:37 -0400 Subject: [PATCH] Progress on VUFIND-454: basic website indexing/search. --- config/vufind/websearchspecs.yaml | 48 ++ config/vufind/website.ini | 25 + import/sitemap.properties | 24 + import/xsl/sitemap.xsl | 17 + module/VuFind/config/module.config.php | 11 + .../src/VuFind/Controller/WebController.php | 61 ++ .../src/VuFind/RecordDriver/SolrWeb.php | 77 ++ .../Search/Factory/SolrWebBackendFactory.php | 75 ++ .../src/VuFind/Search/SolrWeb/Options.php | 73 ++ .../src/VuFind/Search/SolrWeb/Params.php | 41 ++ .../src/VuFind/Search/SolrWeb/Results.php | 52 ++ .../src/VuFind/XSLT/Import/VuFindSitemap.php | 257 +++++++ solr/solr.xml | 1 + solr/website/conf/elevate.xml | 25 + solr/website/conf/protwords.txt | 0 solr/website/conf/schema.xml | 152 ++++ solr/website/conf/solrconfig.xml | 676 ++++++++++++++++++ solr/website/conf/stopwords.txt | 34 + solr/website/conf/synonyms.txt | 5 + .../RecordDriver/SolrWeb/result-list.phtml | 33 + themes/blueprint/templates/web/home.phtml | 1 + themes/blueprint/templates/web/results.phtml | 5 + .../RecordDriver/SolrWeb/result-list.phtml | 26 + themes/jquerymobile/templates/web/home.phtml | 4 + .../jquerymobile/templates/web/results.phtml | 7 + 25 files changed, 1730 insertions(+) create mode 100644 config/vufind/websearchspecs.yaml create mode 100644 config/vufind/website.ini create mode 100644 import/sitemap.properties create mode 100644 import/xsl/sitemap.xsl create mode 100644 module/VuFind/src/VuFind/Controller/WebController.php create mode 100644 module/VuFind/src/VuFind/RecordDriver/SolrWeb.php create mode 100644 module/VuFind/src/VuFind/Search/Factory/SolrWebBackendFactory.php create mode 100644 module/VuFind/src/VuFind/Search/SolrWeb/Options.php create mode 100644 module/VuFind/src/VuFind/Search/SolrWeb/Params.php create mode 100644 
module/VuFind/src/VuFind/Search/SolrWeb/Results.php create mode 100644 module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php create mode 100644 solr/website/conf/elevate.xml create mode 100644 solr/website/conf/protwords.txt create mode 100644 solr/website/conf/schema.xml create mode 100644 solr/website/conf/solrconfig.xml create mode 100644 solr/website/conf/stopwords.txt create mode 100644 solr/website/conf/synonyms.txt create mode 100644 themes/blueprint/templates/RecordDriver/SolrWeb/result-list.phtml create mode 100644 themes/blueprint/templates/web/home.phtml create mode 100644 themes/blueprint/templates/web/results.phtml create mode 100644 themes/jquerymobile/templates/RecordDriver/SolrWeb/result-list.phtml create mode 100644 themes/jquerymobile/templates/web/home.phtml create mode 100644 themes/jquerymobile/templates/web/results.phtml diff --git a/config/vufind/websearchspecs.yaml b/config/vufind/websearchspecs.yaml new file mode 100644 index 00000000000..263e546748f --- /dev/null +++ b/config/vufind/websearchspecs.yaml @@ -0,0 +1,48 @@ +--- +# This file contains search specifications used by the website module; for details +# on the workings of this file, see the comments in searchspecs.yaml. 
+#----------------------------------------------------------------------------------- + +AllFields: + DismaxFields: + - title^750 + - description_unstemmed^350 + - description^300 + - keywords_unstemmed^250 + - keywords^200 + - url_keywords^50 + - fulltext_unstemmed^10 + - fulltext + QueryFields: + title: + - [onephrase, 1000] + - [and, 750] + - [or, 10] + description_unstemmed: + - [onephrase, 400] + - [and, 350] + - [or, ~] + description: + - [onephrase, 350] + - [and, 300] + - [or, ~] + keywords_unstemmed: + - [onephrase, 300] + - [and, 250] + - [or, ~] + keywords: + - [onephrase, 250] + - [and, 200] + - [or, ~] + url_keywords: + - [onephrase, 100] + - [and, 50] + - [or, ~] + fulltext_unstemmed: + - [onephrase, 50] + - [and, 10] + - [or, ~] + fulltext: + - [onephrase, 25] + - [and, 5] + - [or, ~] diff --git a/config/vufind/website.ini b/config/vufind/website.ini new file mode 100644 index 00000000000..cb9638e67f4 --- /dev/null +++ b/config/vufind/website.ini @@ -0,0 +1,25 @@ +; This configuration file controls the Web module (for searching your website); see +; facets.ini and searches.ini for detailed comments on the meaning of all settings. 
+[General] +default_handler = AllFields ; Search handler to use if none is specified +default_sort = relevance +case_sensitive_bools = true +default_side_recommend[] = SideFacets:Facets:CheckboxFacets:website +default_side_recommend[] = CatalogResults +highlighting = true +snippets = true + +[Basic_Searches] +AllFields = "All Fields" + +[Advanced_Searches] +AllFields = "All Fields" + +[Sorting] +relevance = sort_relevance +title = sort_title + +[Facets] +category = "Category" +linktype = "Link Type" +subject = "Subject" diff --git a/import/sitemap.properties b/import/sitemap.properties new file mode 100644 index 00000000000..ea7a8eaf76a --- /dev/null +++ b/import/sitemap.properties @@ -0,0 +1,24 @@ +; XSLT Import Settings for sitemaps (used for populating the special website core) +[General] +; REQUIRED: Name of XSLT file to apply. Path is relative to the import/xsl directory +; of the VuFind installation (or your local override directory). +xslt = sitemap.xsl +; OPTIONAL: PHP function(s) to register for use within XSLT file. You may repeat +; this line to register multiple PHP functions. +;php_function[] = str_replace +; OPTIONAL: PHP class filled with public static functions for use by the XSLT file. +; The class must live within a PHP namespace. You may specify a fully-qualified +; name; if you do not include a namespace, the class will automatically be assumed +; to live in the \VuFind\XSLT\Import namespace. +custom_class[] = VuFindSitemap +; OPTIONAL: If true, all custom_class settings above will be passed to the XSLT with +; their namespaces stripped off; for example, \VuFind\XSLT\Import\VuFind would be +; treated as \VuFind in XSLT files. This allows more compact syntax within XSLT +; files, but it can lead to name conflicts if used carelessly. If set to false, you +; must use fully-qualified names in your XSLT. The false setting is recommended, but +; the current default is "true" for compatibility with legacy configurations. 
+truncate_custom_class = true + +; XSLT parameters -- any key/value pairs set here will be passed as parameters to +; the XSLT file, allowing local values to be set without modifying XSLT code. +[Parameters] diff --git a/import/xsl/sitemap.xsl b/import/xsl/sitemap.xsl new file mode 100644 index 00000000000..9216aa46713 --- /dev/null +++ b/import/xsl/sitemap.xsl @@ -0,0 +1,17 @@ +<!-- XSLT to load sitemap entries into the custom class used for + populating the special "website" Solr core --> +<xsl:stylesheet version="1.0" + xmlns:xsl="http://www.w3.org/1999/XSL/Transform" + xmlns:sitemap="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:php="http://php.net/xsl"> + <xsl:output method="xml" indent="yes" encoding="utf-8"/> + <xsl:template match="sitemap:urlset"> + <add> + <xsl:for-each select="//sitemap:loc"> + <doc> + <xsl:value-of disable-output-escaping="yes" select="php:function('VuFindSitemap::getDocument', normalize-space(string(.)))"/> + </doc> + </xsl:for-each> + </add> + </xsl:template> +</xsl:stylesheet> diff --git a/module/VuFind/config/module.config.php b/module/VuFind/config/module.config.php index 2172f69990f..8d4799bffdd 100644 --- a/module/VuFind/config/module.config.php +++ b/module/VuFind/config/module.config.php @@ -108,6 +108,7 @@ $config = array( 'tag' => 'VuFind\Controller\TagController', 'upgrade' => 'VuFind\Controller\UpgradeController', 'vudl' => 'VuFind\Controller\VudlController', + 'web' => 'VuFind\Controller\WebController', 'worldcat' => 'VuFind\Controller\WorldcatController', 'worldcatrecord' => 'VuFind\Controller\WorldcatrecordController', ), @@ -677,6 +678,14 @@ $config = array( $sm->getServiceLocator()->get('VuFind\Config')->get('searches') ); }, + 'solrweb' => function ($sm) { + return new \VuFind\RecordDriver\SolrWeb( + $sm->getServiceLocator()->get('VuFind\Config')->get('config'), + null, + $sm->getServiceLocator()->get('VuFind\Config')->get('website'), + $sm->getServiceLocator()->get('VuFind\Config')->get('website') + ); + }, 
'summon' => function ($sm) { $summon = $sm->getServiceLocator()->get('VuFind\Config')->get('Summon'); $driver = new \VuFind\RecordDriver\Summon( @@ -816,6 +825,7 @@ $config = array( 'SolrAuth' => 'VuFind\Search\Factory\SolrAuthBackendFactory', 'SolrReserves' => 'VuFind\Search\Factory\SolrReservesBackendFactory', 'SolrStats' => 'VuFind\Search\Factory\SolrStatsBackendFactory', + 'SolrWeb' => 'VuFind\Search\Factory\SolrWebBackendFactory', 'Summon' => 'VuFind\Search\Factory\SummonBackendFactory', 'WorldCat' => 'VuFind\Search\Factory\WorldCatBackendFactory', ), @@ -975,6 +985,7 @@ $staticRoutes = array( 'Upgrade/GetDBCredentials', 'Upgrade/GetDbEncodingPreference', 'Upgrade/GetSourceDir', 'Upgrade/Reset', 'Upgrade/ShowSQL', 'VuDL/Browse', 'VuDL/DSRecord', 'VuDL/Record', + 'Web/Home', 'Web/Results', 'Worldcat/Advanced', 'Worldcat/Home', 'Worldcat/Search' ); diff --git a/module/VuFind/src/VuFind/Controller/WebController.php b/module/VuFind/src/VuFind/Controller/WebController.php new file mode 100644 index 00000000000..97354395182 --- /dev/null +++ b/module/VuFind/src/VuFind/Controller/WebController.php @@ -0,0 +1,61 @@ +<?php +/** + * Web Controller + * + * PHP version 5 + * + * Copyright (C) Villanova University 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Controller
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org Main Site
+ */
+namespace VuFind\Controller;
+
+/**
+ * Web Controller
+ *
+ * @category VuFind2
+ * @package  Controller
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org Main Site
+ */
+class WebController extends AbstractSearch
+{
+    /**
+     * Constructor -- sets the search class ID to 'SolrWeb', then runs parent setup.
+     */
+    public function __construct()
+    {
+        $this->searchClassId = 'SolrWeb';
+        parent::__construct();
+    }
+
+    /**
+     * Home action -- performs no search; just returns a view model for the template.
+     *
+     * @return \Zend\View\Model\ViewModel
+     */
+    public function homeAction()
+    {
+        // Do nothing -- just display template
+        return $this->createViewModel();
+    }
+}
+
diff --git a/module/VuFind/src/VuFind/RecordDriver/SolrWeb.php b/module/VuFind/src/VuFind/RecordDriver/SolrWeb.php
new file mode 100644
index 00000000000..29cd836f106
--- /dev/null
+++ b/module/VuFind/src/VuFind/RecordDriver/SolrWeb.php
@@ -0,0 +1,77 @@
+<?php
+/**
+ * Model for Solr web records.
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  RecordDrivers
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/vufind2:record_drivers Wiki
+ */
+namespace VuFind\RecordDriver;
+
+/**
+ * Model for Solr web records.
+ *
+ * @category VuFind2
+ * @package  RecordDrivers
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/vufind2:record_drivers Wiki
+ */
+class SolrWeb extends SolrDefault
+{
+    /**
+     * Constructor -- sets the preferred snippet fields to description and fulltext.
+     *
+     * @param \Zend\Config\Config $mainConfig     VuFind main configuration (omit for
+     * built-in defaults)
+     * @param \Zend\Config\Config $recordConfig   Record-specific configuration file
+     * (omit to use $mainConfig as $recordConfig)
+     * @param \Zend\Config\Config $searchSettings Search-specific configuration file
+     */
+    public function __construct($mainConfig = null, $recordConfig = null,
+        $searchSettings = null
+    ) {
+        $this->preferredSnippetFields = array('description', 'fulltext');
+        parent::__construct($mainConfig, $recordConfig, $searchSettings);
+    }
+
+    /**
+     * Get text that can be displayed to represent this record in
+     * breadcrumbs.
+     *
+     * @return string Breadcrumb text to represent this record.
+     */
+    public function getBreadcrumb()
+    {
+        return $this->getTitle();
+    }
+
+    /**
+     * Get the URL for the current record.
+     *
+     * @return string URL of the page, or an empty string if no url field is set.
+     */
+    public function getUrl()
+    {
+        return isset($this->fields['url']) ? $this->fields['url'] : '';
+    }
+}
diff --git a/module/VuFind/src/VuFind/Search/Factory/SolrWebBackendFactory.php b/module/VuFind/src/VuFind/Search/Factory/SolrWebBackendFactory.php
new file mode 100644
index 00000000000..7c6336380d8
--- /dev/null
+++ b/module/VuFind/src/VuFind/Search/Factory/SolrWebBackendFactory.php
@@ -0,0 +1,75 @@
+<?php
+
+/**
+ * Factory for the website SOLR backend.
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Search
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org Main Site
+ */
+namespace VuFind\Search\Factory;
+use VuFindSearch\Backend\Solr\Response\Json\RecordCollectionFactory;
+use VuFindSearch\Backend\Solr\Connector;
+
+/**
+ * Factory for the website SOLR backend.
+ *
+ * @category VuFind2
+ * @package  Search
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org Main Site
+ */
+class SolrWebBackendFactory extends AbstractSolrBackendFactory
+{
+    /**
+     * Constructor -- targets the 'website' core/config and websearchspecs.yaml.
+     */
+    public function __construct()
+    {
+        parent::__construct();
+        $this->solrCore = 'website';
+        $this->searchConfig = 'website';
+        $this->searchYaml = 'websearchspecs.yaml';
+    }
+
+    /**
+     * Create the SOLR backend with a record factory that builds SolrWeb drivers.
+     *
+     * @param Connector $connector Connector
+     *
+     * @return \VuFindSearch\Backend\Solr\Backend
+     */
+    protected function createBackend(Connector $connector)
+    {
+        $backend = parent::createBackend($connector);
+        $manager = $this->serviceLocator->get('VuFind\RecordDriverPluginManager');
+        $callback = function ($data) use ($manager) {
+            $driver = $manager->get('SolrWeb');
+            $driver->setRawData($data);
+            return $driver;
+        };
+        $factory = new RecordCollectionFactory($callback);
+        $backend->setRecordCollectionFactory($factory);
+        return $backend;
+    }
+}
\ No newline at end of file
diff --git a/module/VuFind/src/VuFind/Search/SolrWeb/Options.php b/module/VuFind/src/VuFind/Search/SolrWeb/Options.php
new file mode 100644
index 00000000000..58ec34db8b6
--- /dev/null
+++ b/module/VuFind/src/VuFind/Search/SolrWeb/Options.php
@@ -0,0 +1,73 @@
+<?php
+/**
+ * Solr Web aspect of the Search Multi-class (Options)
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+namespace VuFind\Search\SolrWeb;
+
+/**
+ * Solr Web Search Options
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+class Options extends \VuFind\Search\Solr\Options
+{
+    /**
+     * Constructor -- uses 'website' as both the search and the facet config name.
+     *
+     * @param \VuFind\Config\PluginManager $configLoader Config loader
+     */
+    public function __construct(\VuFind\Config\PluginManager $configLoader)
+    {
+        $this->facetsIni = $this->searchIni = 'website';
+        parent::__construct($configLoader);
+    }
+
+    /**
+     * Return the route name for the search results action.
+     *
+     * @return string
+     */
+    public function getSearchAction()
+    {
+        return 'web-results';
+    }
+
+    /**
+     * Return the route name of the action used for performing advanced searches.
+     * Returns false if the feature is not supported.
+     *
+     * @return string|bool
+     */
+    public function getAdvancedSearchAction()
+    {
+        // Not currently supported:
+        return false;
+    }
+}
\ No newline at end of file
diff --git a/module/VuFind/src/VuFind/Search/SolrWeb/Params.php b/module/VuFind/src/VuFind/Search/SolrWeb/Params.php
new file mode 100644
index 00000000000..f8369390625
--- /dev/null
+++ b/module/VuFind/src/VuFind/Search/SolrWeb/Params.php
@@ -0,0 +1,41 @@
+<?php
+/**
+ * Solr Web aspect of the Search Multi-class (Params)
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+namespace VuFind\Search\SolrWeb;
+
+/**
+ * Solr Web Search Parameters (no overrides; inherits \VuFind\Search\Solr\Params)
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+class Params extends \VuFind\Search\Solr\Params
+{
+}
\ No newline at end of file
diff --git a/module/VuFind/src/VuFind/Search/SolrWeb/Results.php b/module/VuFind/src/VuFind/Search/SolrWeb/Results.php
new file mode 100644
index 00000000000..4b8c9b41116
--- /dev/null
+++ b/module/VuFind/src/VuFind/Search/SolrWeb/Results.php
@@ -0,0 +1,52 @@
+<?php
+/**
+ * Solr Web aspect of the Search Multi-class (Results)
+ *
+ * PHP version 5
+ *
+ * Copyright (C) Villanova University 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+namespace VuFind\Search\SolrWeb;
+
+/**
+ * Solr Web Search Results
+ *
+ * @category VuFind2
+ * @package  Search_SolrWeb
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://www.vufind.org Main Page
+ */
+class Results extends \VuFind\Search\Solr\Results
+{
+    /**
+     * Constructor -- runs parent setup, then sets the backend ID to 'SolrWeb'.
+     *
+     * @param \VuFind\Search\Base\Params $params Object representing user search
+     * parameters.
+     */
+    public function __construct(\VuFind\Search\Base\Params $params)
+    {
+        parent::__construct($params);
+        $this->backendId = 'SolrWeb';
+    }
+}
\ No newline at end of file
diff --git a/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php b/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php
new file mode 100644
index 00000000000..cad65240405
--- /dev/null
+++ b/module/VuFind/src/VuFind/XSLT/Import/VuFindSitemap.php
@@ -0,0 +1,257 @@
+<?php
+/**
+ * XSLT importer support methods for sitemaps.
+ *
+ * PHP version 5
+ *
+ * Copyright (c) Demian Katz 2010.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * @category VuFind2
+ * @package  Import_Tools
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/importing_records Wiki
+ */
+namespace VuFind\XSLT\Import;
+
+/**
+ * XSLT support class -- all methods of this class must be public and static;
+ * they will be automatically made available to your XSL stylesheet for use
+ * with the php:function() function.
+ *
+ * @category VuFind2
+ * @package  Import_Tools
+ * @author   Demian Katz <demian.katz@villanova.edu>
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     http://vufind.org/wiki/importing_records Wiki
+ */
+class VuFindSitemap extends VuFind
+{
+    /**
+     * Load metadata about an HTML document using Aperture.
+     *
+     * @param string $htmlFile File on disk containing HTML.
+     *
+     * @return array
+     */
+    protected static function getApertureFields($htmlFile)
+    {
+        $xmlFile = tempnam('/tmp', 'apt');
+        $cmd = static::getApertureCommand($htmlFile, $xmlFile, 'filecrawler');
+        exec($cmd);
+
+        // Give up if no output file exists (NOTE: tempnam() has already created it):
+        if (!file_exists($xmlFile)) {
+            throw new \Exception('Aperture failed.');
+        }
+
+        // Extract and decode the full text from the XML:
+        $xml = file_get_contents($xmlFile);
+        @unlink($xmlFile);
+        preg_match('/<plainTextContent[^>]*>([^<]*)</ms', $xml, $matches);
+        $final = isset($matches[1]) ?
+            html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';
+
+        // Extract the title from the XML:
+        preg_match('/<title[^>]*>([^<]*)</ms', $xml, $matches);
+        $title = isset($matches[1]) ?
+            html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';
+
+        // Extract the keywords from the XML:
+        preg_match_all('/<keyword[^>]*>([^<]*)</ms', $xml, $matches);
+        $keywords = array();
+        if (isset($matches[1])) {
+            foreach($matches[1] as $current) {
+                $keywords[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
+            }
+        }
+
+        // Extract the description from the XML:
+        preg_match('/<description[^>]*>([^<]*)</ms', $xml, $matches);
+        $description = isset($matches[1])
+            ? html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';
+
+        // Send back the extracted fields:
+        return array(
+            'title' => $title,
+            'keywords' => $keywords,
+            'description' => $description,
+            'fulltext' => $final,
+        );
+    }
+
+    /**
+     * Load metadata about an HTML document using Tika.
+     *
+     * @param string $htmlFile File on disk containing HTML.
+     *
+     * @return array
+     */
+    protected static function getTikaFields($htmlFile)
+    {
+        // Run Tika with --xml; title/keywords/description are parsed from this:
+        $xml = static::harvestWithTika($htmlFile, '--xml');
+
+        // Extract the title from the XML:
+        preg_match('/<title[^>]*>([^<]*)</ms', $xml, $matches);
+        $title = isset($matches[1]) ?
+            html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';
+
+        // Extract the keywords from the XML:
+        preg_match_all(
+            '/<meta name="keywords" content="([^"]*)"/ms', $xml, $matches
+        );
+        $keywords = array();
+        if (isset($matches[1])) {
+            foreach($matches[1] as $current) {
+                $keywords[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
+            }
+        }
+
+        // Extract the description from the XML:
+        preg_match('/<meta name="description" content="([^"]*)"/ms', $xml, $matches);
+        $description = isset($matches[1])
+            ? html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8') : '';
+
+        // Send back the extracted fields:
+        return array(
+            'title' => $title,
+            'keywords' => $keywords,
+            'description' => $description,
+            'fulltext' => $title . ' ' . static::harvestWithTika($htmlFile),
+        );
+    }
+
+    /**
+     * Extract key metadata from HTML.
+     *
+     * @param string $html HTML content.
+     *
+     * @return array
+     */
+    protected static function getHtmlFields($html)
+    {
+        // Extract the subjects from the HTML:
+        preg_match_all('/<meta name="subject" content="([^"]*)"/ms', $html, $matches);
+        $subjects = array();
+        if (isset($matches[1])) {
+            foreach($matches[1] as $current) {
+                $subjects[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
+            }
+        }
+
+        // Extract the categories from the HTML:
+        preg_match_all('/<meta name="category" content="([^"]*)"/ms', $html, $matches);
+        $categories = array();
+        if (isset($matches[1])) {
+            foreach($matches[1] as $current) {
+                $categories[] = html_entity_decode($current, ENT_QUOTES, 'UTF-8');
+            }
+        }
+
+        // Extract the use count from the HTML:
+        preg_match_all('/<meta name="useCount" content="([^"]*)"/ms', $html, $matches);
+        // Pages that do not declare a use count default to 1:
+        $useCount = isset($matches[1][0]) ? $matches[1][0] : 1;
+
+        return array(
+            'category' => $categories,
+            'subject' => $subjects,
+            'use_count' => $useCount,
+        );
+    }
+
+    /**
+     * Convert an associative array of fields into a Solr document.
+     *
+     * @param array $fields Field data
+     *
+     * @return string
+     */
+    public static function arrayToSolrXml($fields)
+    {
+        $xml = '';
+        foreach ($fields as $key => $value) {
+            $value = is_array($value) ? $value : array($value);
+            foreach ($value as $current) {
+                if ((string)$current !== '') { // unlike empty(), keeps "0"
+                    $xml .= '<field name="' . $key . '">'
+                        . htmlspecialchars($current) . '</field>';
+                }
+            }
+        }
+        return $xml;
+    }
+
+    /**
+     * Fetch the page at a URL and convert it into Solr document fields. The
+     * full text parser (Aperture or Tika) is chosen by static::getParser();
+     * if it reports 'None', nothing is fetched and an empty string is
+     * returned.
+     *
+     * @param string $url URL of file to retrieve.
+     *
+     * @return string     Solr <field> elements for the page ('' if no parser).
+     * @throws \Exception If the configured parser is not recognized.
+     */
+    public static function getDocument($url)
+    {
+        $parser = static::getParser();
+        if ($parser == 'None') {
+            return '';
+        }
+
+        // Grab the HTML and write it to disk:
+        $htmlFile = tempnam('/tmp', 'htm');
+        $html = file_get_contents($url);
+        file_put_contents($htmlFile, $html);
+
+        // Use the appropriate full text parser:
+        switch ($parser) {
+        case 'Aperture':
+            $fields = static::getApertureFields($htmlFile);
+            break;
+        case 'Tika':
+            $fields = static::getTikaFields($htmlFile);
+            break;
+        default:
+            throw new \Exception('Unexpected parser: ' . $parser);
+        }
+
+        // Clean up HTML file:
+        @unlink($htmlFile);
+
+        // Add data loaded directly from HTML:
+        $fields += static::getHtmlFields($html);
+
+        // Clean up/normalize full text:
+        $fields['fulltext'] = trim(
+            preg_replace(
+                '/\s+/', ' ', static::stripBadChars($fields['fulltext'])
+            )
+        );
+
+        // Use a hash of the URL for the ID:
+        $fields['id'] = md5($url);
+
+        // Add other key values (timestamp is UTC, as the trailing Z asserts):
+        $fields['url'] = $url;
+        $fields['last_indexed'] = gmdate('Y-m-d\TH:i:s\Z');
+
+        // Turn the array into XML:
+        return static::arrayToSolrXml($fields);
+    }
+}
diff --git a/solr/solr.xml b/solr/solr.xml
index 694a3c36eb6..41f6d035aef 100644
--- a/solr/solr.xml
+++ b/solr/solr.xml
@@ -3,6 +3,7 @@
     <core name="biblio" instanceDir="biblio"/>
     <core name="authority" instanceDir="authority"/>
     <core name="stats" instanceDir="stats"/>
+    <!-- uncomment to enable website core: <core name="website" instanceDir="website"/> -->
     <!-- uncomment to enable reserves core: <core name="reserves" instanceDir="reserves"/> -->
   </cores>
 </solr>
diff --git a/solr/website/conf/elevate.xml b/solr/website/conf/elevate.xml
new file mode 100644
index
00000000000..b5ccaaea785 --- /dev/null +++ b/solr/website/conf/elevate.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<!-- If this file is found in the config directory, it will only be + loaded once at startup. If it is found in Solr's data + directory, it will be re-loaded every commit. 
+--> + +<elevate> +</elevate> diff --git a/solr/website/conf/protwords.txt b/solr/website/conf/protwords.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/solr/website/conf/schema.xml b/solr/website/conf/schema.xml new file mode 100644 index 00000000000..7931461bada --- /dev/null +++ b/solr/website/conf/schema.xml @@ -0,0 +1,152 @@ +<?xml version="1.0" ?> +<schema name="VuFind Website Index" version="1.2"> + <types> + <!-- Define fieldType long as it is needed by the _version_ field required by Solr 4.x --> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> + <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> + <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/> + <fieldType name="textFacet" class="solr.TextField" sortMissingLast="true" omitNorms="true"> + <analyzer> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <!-- strip trailing punctuation from facets: --> + <filter class="solr.PatternReplaceFilterFactory" pattern="(?&lt;!\b[A-Z])[.\s]*$" replacement="" replace="first"/> + </analyzer> + </fieldType> + <fieldType name="text" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.SnowballPorterFilterFactory" language="English"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" 
synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.SnowballPorterFilterFactory" language="English"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + <!-- Text Field without Stemming and Synonyms --> + <fieldType name="textProper" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + <!-- Basic Text Field for use with Spell Correction --> + <fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/> 
+ <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + <!-- More advanced spell checking field. --> + <fieldType name="textSpellShingle" class="solr.TextField" positionIncrementGap="100"> + <analyzer type="index"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.ShingleFilterFactory" maxShingleSize="2" outputUnigrams="false"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.ICUFoldingFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> + <filter class="solr.ShingleFilterFactory" maxShingleSize="2" outputUnigrams="false"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + <!-- Text Field for Normalized ISBN/ISSN Numbers - take first chunk of text + prior to whitespace, force to lowercase, strip non-ISBN/ISSN characters, + omit results that are empty after stripping. 
--> + <fieldType name="isn" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.PatternTokenizerFactory" pattern="^(\S*)\s*.*$" group="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.PatternReplaceFilterFactory" pattern="[^0-9x]" replacement="" replace="all"/> + <filter class="solr.LengthFilterFactory" min="1" max="100" /> + </analyzer> + </fieldType> + <fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/> + </types> + <fields> + <!-- Required by Solr 4.x --> + <field name="_version_" type="long" indexed="true" stored="true"/> + <!-- Core Fields --> + <field name="id" type="string" indexed="true" stored="true"/> + <field name="fulltext" type="text" indexed="true" stored="true"/> + <field name="fulltext_unstemmed" type="textProper" indexed="true" stored="false"/> + <field name="description" type="text" indexed="true" stored="true"/> + <field name="description_unstemmed" type="textProper" indexed="true" stored="false"/> + <field name="keywords" type="text" indexed="true" stored="true" multiValued="true"/> + <field name="keywords_unstemmed" type="textProper" indexed="true" stored="false" multiValued="true"/> + <field name="spelling" type="textSpell" indexed="true" stored="true"/> + <field name="spellingShingle" type="textSpellShingle" indexed="true" stored="true" multiValued="true"/> + <field name="last_indexed" type="date" indexed="true" stored="true"/> + <field name="title" type="text" indexed="true" stored="true"/> + <field name="title_sort" type="string" indexed="true" stored="false"/> + <field name="url" type="string" indexed="false" stored="true"/> + <field name="url_keywords" type="text" indexed="true" stored="false"/> + <!-- Popularity of page - can be used in function query for weighting --> + <field name="use_count" type="sint" indexed="true" stored="true"/> + <!-- Facet Fields --> + <field name="category" type="textFacet" indexed="true" stored="true" 
multiValued="true"/> + <field name="linktype" type="textFacet" indexed="true" stored="true" multiValued="true"/> + <field name="subject" type="textFacet" indexed="true" stored="true" multiValued="true"/> + <!-- Dynamic fields for customization without schema modification --> + <dynamicField name="*_date" type="date" indexed="true" stored="true"/> + <dynamicField name="*_date_mv" type="date" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_isn" type="isn" indexed="true" stored="true"/> + <dynamicField name="*_isn_mv" type="isn" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_str" type="string" indexed="true" stored="true"/> + <dynamicField name="*_str_mv" type="string" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_txt" type="text" indexed="true" stored="true"/> + <dynamicField name="*_txt_mv" type="text" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_txtF" type="textFacet" indexed="true" stored="true"/> + <dynamicField name="*_txtF_mv" type="textFacet" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="*_txtP" type="textProper" indexed="true" stored="true"/> + <dynamicField name="*_txtP_mv" type="textProper" indexed="true" stored="true" multiValued="true"/> + </fields> + <uniqueKey>id</uniqueKey> + <defaultSearchField>fulltext</defaultSearchField> + <!-- CopyFields for Spelling --> + <copyField source="title" dest="spellingShingle"/> + <!-- Copy title information to special field for sorting --> + <copyField source="title" dest="title_sort"/> + <!-- CopyFields to allow keyword searching within URL --> + <copyField source="url" dest="url_keywords"/> + <!-- CopyFields for unstemmed searching --> + <copyField source="fulltext" dest="fulltext_unstemmed"/> + <copyField source="description" dest="description_unstemmed"/> + <copyField source="keywords" dest="keywords_unstemmed"/> + <!-- CopyFields to treat category metadata as additional keywords 
--> + <copyField source="category" dest="keywords" /> + <copyField source="subject" dest="keywords" /> + <copyField source="linktype" dest="keywords" /> + <copyField source="category" dest="keywords_unstemmed" /> + <copyField source="subject" dest="keywords_unstemmed" /> + <copyField source="linktype" dest="keywords_unstemmed" /> + <!-- Default Boolean Operator --> + <solrQueryParser defaultOperator="AND"/> +</schema> diff --git a/solr/website/conf/solrconfig.xml b/solr/website/conf/solrconfig.xml new file mode 100644 index 00000000000..ffe23d65132 --- /dev/null +++ b/solr/website/conf/solrconfig.xml @@ -0,0 +1,676 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<config> + <!-- Set this to 'false' if you want solr to continue working after it has + encountered a severe configuration error. In a production environment, + you may want solr to keep working even if one handler is mis-configured. + + You may also set this to false by setting the system property: + -Dsolr.abortOnConfigurationError=false + --> + <abortOnConfigurationError>${solr.abortOnConfigurationError:false}</abortOnConfigurationError> + + <!-- Controls what version of Lucene various components of Solr + adhere to. 
Generally, you want to use the latest version to + get all bug fixes and improvements. It is highly recommended + that you fully re-index after changing this setting as it can + affect both how text is indexed and queried. + --> + <luceneMatchVersion>LUCENE_42</luceneMatchVersion> + + <!-- Used to specify an alternate directory to hold all index data + other than the default ./data under the Solr home. + If replication is in use, this should match the replication configuration. --> + <dataDir>${solr.solr.home:./solr}/website</dataDir> + + + <indexConfig> + + <useCompoundFile>false</useCompoundFile> + + <mergeFactor>10</mergeFactor> + + + <!-- ramBufferSizeMB sets the amount of RAM that may be used by Lucene + indexing for buffering added documents and deletions before they are + flushed to the Directory. + maxBufferedDocs sets a limit on the number of documents buffered + before flushing. + If both ramBufferSizeMB and maxBufferedDocs is set, then + Lucene will flush based on whichever limit is hit first. --> + <ramBufferSizeMB>100</ramBufferSizeMB> + <!-- <maxBufferedDocs>1000</maxBufferedDocs> --> + + + <!-- Commit Deletion Policy + + Custom deletion policies can be specified here. The class must + implement org.apache.lucene.index.IndexDeletionPolicy. + + http://lucene.apache.org/java/3_5_0/api/core/org/apache/lucene/index/IndexDeletionPolicy.html + + The default Solr IndexDeletionPolicy implementation supports + deleting index commit points on number of commits, age of + commit point and optimized status. + + The latest commit point should always be preserved regardless + of the criteria. + --> + <deletionPolicy class="solr.SolrDeletionPolicy"> + <!-- Keep only optimized commit points --> + <str name="keepOptimizedOnly">false</str> + <!-- The maximum number of commit points to be kept --> + <str name="maxCommitsToKeep">1</str> + <!-- + Delete all commit points once they have reached the given age. + Supports DateMathParser syntax e.g. 
+ + <str name="maxCommitAge">30MINUTES</str> + <str name="maxCommitAge">1DAY</str> + --> + </deletionPolicy> + + </indexConfig> + + <!-- Enables JMX if and only if an existing MBeanServer is found, use + this if you want to configure JMX through JVM parameters. Remove + this to disable exposing Solr configuration and statistics to JMX. + + If you want to connect to a particular server, specify the agentId + e.g. <jmx agentId="myAgent" /> + + If you want to start a new MBeanServer, specify the serviceUrl + e.g. <jmx serviceUrl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr" /> + + For more details see http://wiki.apache.org/solr/SolrJmx + --> + <jmx /> + + <!-- the default high-performance update handler --> + <updateHandler class="solr.DirectUpdateHandler2"> + + + <!-- Enables a transaction log, used for real-time get, durability, and + solr cloud replica recovery. The log can grow as big as + uncommitted changes to the index, so use of a hard autoCommit + is recommended (see below). + "dir" - the target directory for transaction logs, defaults to the + solr data directory. --> + <updateLog> + <str name="dir">${solr.ulog.dir:}</str> + </updateLog> + + <!-- A prefix of "solr." for class names is an alias that + causes solr to search appropriate packages, including + org.apache.solr.(search|update|request|core|analysis) + --> + + <!-- Perform a <commit/> automatically under certain conditions: + maxDocs - number of updates since last commit is greater than this + maxTime - oldest uncommitted update (in ms) is this long ago + --> + <autoCommit> + <maxTime>15000</maxTime> + <openSearcher>false</openSearcher> + </autoCommit> + + <!-- The RunExecutableListener executes an external command. + exe - the name of the executable to run + dir - dir to use as the current working directory. default="." + wait - the calling thread waits until the executable returns. default="true" + args - the arguments to pass to the program. 
default=nothing + env - environment variables to set. default=nothing + --> + <!-- A postCommit event is fired after every commit or optimize command + <listener event="postCommit" class="solr.RunExecutableListener"> + <str name="exe">solr/bin/snapshooter</str> + <str name="dir">.</str> + <bool name="wait">true</bool> + <arr name="args"> <str>arg1</str> <str>arg2</str> </arr> + <arr name="env"> <str>MYVAR=val1</str> </arr> + </listener> + --> + <!-- A postOptimize event is fired only after every optimize command, useful + in conjunction with index distribution to only distribute optimized indices + <listener event="postOptimize" class="solr.RunExecutableListener"> + <str name="exe">snapshooter</str> + <str name="dir">solr/bin</str> + <bool name="wait">true</bool> + </listener> + --> + + </updateHandler> + + + <query> + <!-- Maximum number of clauses in a boolean query... can affect + range or prefix queries that expand to big boolean + queries. An exception is thrown if exceeded. --> + <maxBooleanClauses>1024</maxBooleanClauses> + + + <!-- Cache used by SolrIndexSearcher for filters (DocSets), + unordered sets of *all* documents that match a query. + When a new searcher is opened, its caches may be prepopulated + or "autowarmed" using data from caches in the old searcher. + autowarmCount is the number of items to prepopulate. For LRUCache, + the autowarmed items will be the most recently accessed items. + Parameters: + class - the SolrCache implementation (LRUCache or FastLRUCache) + size - the maximum number of entries in the cache + initialSize - the initial capacity (number of entries) of + the cache. (see java.util.HashMap) + autowarmCount - the number of entries to prepopulate from + an old cache. 
+ --> + <filterCache + class="solr.FastLRUCache" + size="300000" + initialSize="300000" + autowarmCount="50000"/> + + <!-- queryResultCache caches results of searches - ordered lists of + document ids (DocList) based on a query, a sort, and the range + of documents requested. --> + <queryResultCache + class="solr.LRUCache" + size="100000" + initialSize="100000" + autowarmCount="50000"/> + + <!-- documentCache caches Lucene Document objects (the stored fields for each document). + Since Lucene internal document ids are transient, this cache will not be autowarmed, so autowarmCount is set to 0. --> + <documentCache + class="solr.LRUCache" + size="50000" + initialSize="50000" + autowarmCount="0"/> + + <!-- If true, stored fields that are not requested will be loaded lazily. + + This can result in a significant speed improvement if the usual case is to + not load all stored fields, especially if the skipped fields are large compressed + text fields. + --> + <enableLazyFieldLoading>true</enableLazyFieldLoading> + + <!-- Example of a generic cache. These caches may be accessed by name + through SolrIndexSearcher.getCache(),cacheLookup(), and cacheInsert(). + The purpose is to enable easy caching of user/application level data. + The regenerator argument should be specified as an implementation + of solr.search.CacheRegenerator if autowarming is desired. --> + <!-- + <cache name="myUserCache" + class="solr.LRUCache" + size="4096" + initialSize="1024" + autowarmCount="1024" + regenerator="org.mycompany.mypackage.MyRegenerator" + /> + --> + + <!-- An optimization that attempts to use a filter to satisfy a search. + If the requested sort does not include score, then the filterCache + will be checked for a filter matching the query. If found, the filter + will be used as the source of document ids, and then the sort will be + applied to that. + <useFilterForSortedQuery>true</useFilterForSortedQuery> + --> + + <!-- An optimization for use with the queryResultCache. 
When a search + is requested, a superset of the requested number of document ids + is collected. For example, if a search for a particular query + requests matching documents 10 through 19, and queryResultWindowSize is 50, + then documents 0 through 49 will be collected and cached. Any further + requests in that range can be satisfied via the cache. --> + <queryResultWindowSize>50</queryResultWindowSize> + + <!-- Maximum number of documents to cache for any entry in the + queryResultCache. --> + <queryResultMaxDocsCached>200</queryResultMaxDocsCached> + + <!-- This entry enables an int hash representation for filters (DocSets) + when the number of items in the set is less than maxSize. For smaller + sets, this representation is more memory efficient, more efficient to + iterate over, and faster to take intersections. --> + <HashDocSet maxSize="3000" loadFactor="0.75"/> + + <!-- a newSearcher event is fired whenever a new searcher is being prepared + and there is a current searcher handling requests (aka registered). --> + <!-- QuerySenderListener takes an array of NamedList and executes a + local query request for each NamedList in sequence. --> + <listener event="newSearcher" class="solr.QuerySenderListener"> + <arr name="queries"> + <lst> + <str name="q">science art business engineering history</str> + <str name="start">0</str> + <str name="rows">10</str> + </lst> + </arr> + </listener> + + <!-- a firstSearcher event is fired whenever a new searcher is being + prepared but there is no current registered searcher to handle + requests or to gain autowarming data from. 
--> + <listener event="firstSearcher" class="solr.QuerySenderListener"> + <arr name="queries"> + <lst> + <str name="q">science art business engineering history</str> + <!-- + <str name="facet.field">format</str> + <str name="fq">format:book</str> + --> + </lst> + </arr> + </listener> + + <!-- If a search request comes in and there is no current registered searcher, + then immediately register the still warming searcher and use it. If + "false" then all requests will block until the first searcher is done + warming. --> + <useColdSearcher>false</useColdSearcher> + + <!-- Maximum number of searchers that may be warming in the background + concurrently. An error is returned if this limit is exceeded. Recommend + 1-2 for read-only slaves, higher for masters w/o cache warming. --> + <maxWarmingSearchers>2</maxWarmingSearchers> + + </query> + + <!-- + Let the dispatch filter handle /select?qt=XXX + handleSelect=true will use consistent error handling for /select and /update + handleSelect=false will use solr1.1 style error formatting + --> + <requestDispatcher handleSelect="true" > + <!--Make sure your system has some authentication before enabling remote streaming! --> + <requestParsers enableRemoteStreaming="true" + multipartUploadLimitInKB="2048000" + formdataUploadLimitInKB="2048"/> + + <!-- Set HTTP caching related parameters (for proxy caches and clients). + + To get the behaviour of Solr 1.2 (ie: no caching related headers) + use the never304="true" option and do not specify a value for + <cacheControl> + --> + <!-- <httpCaching never304="true"> --> + <httpCaching lastModifiedFrom="openTime" + etagSeed="Solr"> + <!-- lastModifiedFrom="openTime" is the default, the Last-Modified value + (and validation against If-Modified-Since requests) will all be + relative to when the current Searcher was opened. + You can change it to lastModifiedFrom="dirLastMod" if you want the + value to exactly correspond to when the physical index was last + modified. + + etagSeed="..." 
is an option you can change to force the ETag + header (and validation against If-None-Match requests) to be + different even if the index has not changed (ie: when making + significant changes to your config file) + + lastModifiedFrom and etagSeed are both ignored if you use the + never304="true" option. + --> + <!-- If you include a <cacheControl> directive, it will be used to + generate a Cache-Control header, as well as an Expires header + if the value contains "max-age=" + + By default, no Cache-Control header is generated. + + You can use the <cacheControl> option even if you have set + never304="true" + --> + <!-- <cacheControl>max-age=30, public</cacheControl> --> + </httpCaching> + </requestDispatcher> + + + <!-- requestHandler plugins... incoming queries will be dispatched to the + correct handler based on the path or the qt (query type) param. + Names starting with a '/' are accessed with a path equal to the + registered name. Names without a leading '/' are accessed with: + http://host/app/select?qt=name + If no qt is defined, the requestHandler that declares default="true" + will be used. + --> + <requestHandler name="standard" class="solr.StandardRequestHandler" default="true"> + <!-- default values for query parameters --> + <lst name="defaults"> + <str name="echoParams">explicit</str> + <!-- + <int name="rows">10</int> + <str name="fl">*</str> + <str name="version">2.1</str> + --> + <str name="spellcheck.extendedResults">true</str> + <str name="spellcheck.onlyMorePopular">true</str> + <str name="spellcheck.count">20</str> + </lst> + <arr name="last-components"> + <str>spellcheck</str> + </arr> + </requestHandler> + + <!-- the following handler will be used for eligible dismax searches defined + in config/vufind/websearchspecs.yaml. Searches relying on advanced features + incompatible with dismax will be sent to the standard handler instead. + You can use this handler definition to set global Dismax settings + (e.g. mm / bf). 
If you need different settings for different types of + searches (e.g. Title vs. Author), you can also configure individual + settings in the websearchspecs.yaml file. + --> + <requestHandler name="dismax" class="solr.SearchHandler"> + <lst name="defaults"> + <str name="defType">dismax</str> + <str name="echoParams">explicit</str> + <str name="spellcheck.extendedResults">true</str> + <str name="spellcheck.onlyMorePopular">true</str> + <str name="spellcheck.count">20</str> + </lst> + <arr name="last-components"> + <str>spellcheck</str> + </arr> + </requestHandler> + + <searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent"> + <lst name="spellchecker"> + <str name="name">default</str> + <str name="field">spellingShingle</str> + <str name="accuracy">0.75</str> + <str name="spellcheckIndexDir">./spellShingle</str> + <str name="queryAnalyzerFieldType">textSpellShingle</str> + <str name="buildOnOptimize">true</str> + </lst> + <lst name="spellchecker"> + <str name="name">basicSpell</str> + <str name="field">spelling</str> + <str name="accuracy">0.75</str> + <str name="spellcheckIndexDir">./spellchecker</str> + <str name="queryAnalyzerFieldType">textSpell</str> + <str name="buildOnOptimize">true</str> + </lst> + </searchComponent> + <queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/> + + <!-- Search component for extracting terms (useful for sitemap generation) --> + <searchComponent name="term" class="org.apache.solr.handler.component.TermsComponent"> + </searchComponent> + + <!-- + + Search components are registered to SolrCore and used by Search Handlers + + By default, the following components are available: + + <searchComponent name="query" class="org.apache.solr.handler.component.QueryComponent" /> + <searchComponent name="facet" class="org.apache.solr.handler.component.FacetComponent" /> + <searchComponent name="mlt" class="org.apache.solr.handler.component.MoreLikeThisComponent" /> + 
<searchComponent name="highlight" class="org.apache.solr.handler.component.HighlightComponent" /> + <searchComponent name="stats" class="org.apache.solr.handler.component.StatsComponent" /> + <searchComponent name="debug" class="org.apache.solr.handler.component.DebugComponent" /> + + If you register a searchComponent to one of the standard names, that will be used instead. + + --> + + <requestHandler name="/search" class="org.apache.solr.handler.component.SearchHandler"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + </lst> + <!-- + By default, this will register the following components: + + <arr name="components"> + <str>query</str> + <str>facet</str> + <str>mlt</str> + <str>highlight</str> + <str>debug</str> + </arr> + + To insert handlers before or after the 'standard' components, use: + + <arr name="first-components"> + <str>first</str> + </arr> + + <arr name="last-components"> + <str>last</str> + </arr> + + --> + <arr name="last-components"> + <str>spellcheck</str> + <str>elevator</str> + </arr> + </requestHandler> + + <!-- Request handler to extract terms (useful for sitemap generation) --> + <requestHandler name="/term" class="org.apache.solr.handler.component.SearchHandler"> + <arr name="components"> + <str>term</str> + </arr> + </requestHandler> + + <searchComponent name="elevator" class="org.apache.solr.handler.component.QueryElevationComponent" > + <!-- pick a fieldType to analyze queries --> + <str name="queryFieldType">string</str> + <str name="config-file">elevate.xml</str> + </searchComponent> + + <requestHandler name="/elevate" class="org.apache.solr.handler.component.SearchHandler" startup="lazy"> + <lst name="defaults"> + <str name="echoParams">explicit</str> + </lst> + <arr name="last-components"> + <str>elevator</str> + </arr> + </requestHandler> + + + + <!-- Update request handler. + + Note: Since solr1.1 requestHandlers requires a valid content type header if posted in + the body. 
For example, curl now requires: -H 'Content-type:text/xml; charset=utf-8' + The response format differs from solr1.1 formatting and returns a standard error code. + + To enable solr1.1 behavior, remove the /update handler or change its path + --> + <requestHandler name="/update" class="solr.UpdateRequestHandler"> + <!-- See below for information on defining + updateRequestProcessorChains that can be used by name + on each Update Request + --> + <!-- + <lst name="defaults"> + <str name="update.chain">dedupe</str> + </lst> + --> + </requestHandler> + <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler"/> + + <!-- Field Analysis Request Handler + + RequestHandler that provides much the same functionality as + analysis.jsp. Provides the ability to specify multiple field + types and field names in the same request and outputs + index-time and query-time analysis for each of them. + + Request parameters are: + analysis.fieldname - field name whose analyzers are to be used + + analysis.fieldtype - field type whose analyzers are to be used + analysis.fieldvalue - text for index-time analysis + q (or analysis.q) - text for query time analysis + analysis.showmatch (true|false) - When set to true and when + query analysis is performed, the produced tokens of the + field value analysis will be marked as "matched" for every + token that is produced by the query analysis + --> + <requestHandler name="/analysis/field" + startup="lazy" + class="solr.FieldAnalysisRequestHandler" /> + + + <!-- Document Analysis Handler + + http://wiki.apache.org/solr/AnalysisRequestHandler + + An analysis handler that provides a breakdown of the analysis + process of provided documents. This handler expects a (single) + content stream with the following format: + + <docs> + <doc> + <field name="id">1</field> + <field name="name">The Name</field> + <field name="text">The Text Value</field> + </doc> + <doc>...</doc> + <doc>...</doc> + ... 
+ </docs> + + Note: Each document must contain a field which serves as the + unique key. This key is used in the returned response to associate + an analysis breakdown to the analyzed document. + + Like the FieldAnalysisRequestHandler, this handler also supports + query analysis by sending either an "analysis.query" or "q" + request parameter that holds the query text to be analyzed. It + also supports the "analysis.showmatch" parameter which when set to + true, all field tokens that match the query tokens will be marked + as a "match". + --> + <requestHandler name="/analysis/document" + class="solr.DocumentAnalysisRequestHandler" + startup="lazy" /> + + + <!-- CSV update handler, loaded on demand --> + <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" /> + + + <!-- + Admin Handlers - This will register all the standard admin RequestHandlers. Adding + this single handler is equivalent to registering: + + <requestHandler name="/admin/luke" class="org.apache.solr.handler.admin.LukeRequestHandler" /> + <requestHandler name="/admin/system" class="org.apache.solr.handler.admin.SystemInfoHandler" /> + <requestHandler name="/admin/plugins" class="org.apache.solr.handler.admin.PluginInfoHandler" /> + <requestHandler name="/admin/threads" class="org.apache.solr.handler.admin.ThreadDumpHandler" /> + <requestHandler name="/admin/properties" class="org.apache.solr.handler.admin.PropertiesRequestHandler" /> + <requestHandler name="/admin/file" class="org.apache.solr.handler.admin.ShowFileRequestHandler" > + + If you wish to hide files under ${solr.home}/conf, explicitly register the ShowFileRequestHandler using: + <requestHandler name="/admin/file" class="org.apache.solr.handler.admin.ShowFileRequestHandler" > + <lst name="invariants"> + <str name="hidden">synonyms.txt</str> + <str name="hidden">anotherfile.txt</str> + </lst> + </requestHandler> + --> + <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" /> + + <!-- 
ping/healthcheck --> + <requestHandler name="/admin/ping" class="PingRequestHandler"> + <lst name="defaults"> + <str name="qt">standard</str> + <str name="q">solrpingquery</str> + <str name="echoParams">all</str> + </lst> + </requestHandler> + + <!-- Echo the request contents back to the client --> + <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" > + <lst name="defaults"> + <str name="echoParams">explicit</str> <!-- for all params (including the default etc) use: 'all' --> + <str name="echoHandler">true</str> + </lst> + </requestHandler> + + <searchComponent class="solr.HighlightComponent" name="highlight"> + <highlighting> + <!-- Configure the standard fragmenter --> + <!-- This could most likely be commented out in the "default" case --> + <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true"> + <lst name="defaults"> + <int name="hl.fragsize">100</int> + </lst> + </fragmenter> + + <!-- A regular-expression-based fragmenter (f.i., for sentence extraction) --> + <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter"> + <lst name="defaults"> + <!-- slightly smaller fragsizes work better because of slop --> + <int name="hl.fragsize">70</int> + <!-- allow 50% slop on fragment sizes --> + <float name="hl.regex.slop">0.5</float> + <!-- a basic sentence pattern --> + <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str> + </lst> + </fragmenter> + + <!-- Configure the standard formatter --> + <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true"> + <lst name="defaults"> + <str name="hl.simple.pre"><![CDATA[<em>]]></str> + <str name="hl.simple.post"><![CDATA[</em>]]></str> + </lst> + </formatter> + </highlighting> + </searchComponent> + + <!-- queryResponseWriter plugins... query responses will be written using the + writer specified by the 'wt' request parameter matching the name of a registered + writer. 
+ The "default" writer is the default and will be used if 'wt' is not specified + in the request. XMLResponseWriter will be used if nothing is specified here. + The json, python, and ruby writers are also available by default. + + <queryResponseWriter name="xml" class="org.apache.solr.request.XMLResponseWriter" default="true"/> + <queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/> + <queryResponseWriter name="python" class="org.apache.solr.request.PythonResponseWriter"/> + <queryResponseWriter name="ruby" class="org.apache.solr.request.RubyResponseWriter"/> + <queryResponseWriter name="php" class="org.apache.solr.request.PHPResponseWriter"/> + <queryResponseWriter name="phps" class="org.apache.solr.request.PHPSerializedResponseWriter"/> + + <queryResponseWriter name="custom" class="com.example.MyResponseWriter"/> + --> + + <!-- XSLT response writer transforms the XML output by any xslt file found + in Solr's conf/xslt directory. Changes to xslt files are checked for + every xsltCacheLifetimeSeconds. 
+ --> + <queryResponseWriter name="xslt" class="org.apache.solr.response.XSLTResponseWriter"> + <int name="xsltCacheLifetimeSeconds">5</int> + </queryResponseWriter> + + <!-- config for the admin interface --> + <admin> + <defaultQuery>shakespeare</defaultQuery> + + <!-- configure a healthcheck file for servers behind a loadbalancer + <healthcheck type="file">server-enabled</healthcheck> + --> + </admin> + +</config> diff --git a/solr/website/conf/stopwords.txt b/solr/website/conf/stopwords.txt new file mode 100644 index 00000000000..40a5a5fa2f1 --- /dev/null +++ b/solr/website/conf/stopwords.txt @@ -0,0 +1,34 @@ +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +s +such +t +that +the +their +then +there +these +they +this +to +was +will +with \ No newline at end of file diff --git a/solr/website/conf/synonyms.txt b/solr/website/conf/synonyms.txt new file mode 100644 index 00000000000..29b05690f1a --- /dev/null +++ b/solr/website/conf/synonyms.txt @@ -0,0 +1,5 @@ +I,i,1,one,One +II,ii,2,two,Two +III,iii,3,three,Three +IV,iv,4,four,Four +V,v,5,five,Five \ No newline at end of file diff --git a/themes/blueprint/templates/RecordDriver/SolrWeb/result-list.phtml b/themes/blueprint/templates/RecordDriver/SolrWeb/result-list.phtml new file mode 100644 index 00000000000..5eb814bac99 --- /dev/null +++ b/themes/blueprint/templates/RecordDriver/SolrWeb/result-list.phtml @@ -0,0 +1,33 @@ +<? + $url = $this->driver->getUrl(); +?> +<div class="listentry span-15"> + <div class="resultItemLine1"> + <a href="<?=$this->escapeHtml($url)?>" class="title"><? 
+ $summHighlightedTitle = $this->driver->getHighlightedTitle(); + $summTitle = $this->driver->getTitle(); + if (!empty($summHighlightedTitle)) { + echo $this->highlight($this->addEllipsis($summHighlightedTitle, $summTitle)); + } else if (!empty($summTitle)) { + echo $this->escapeHtml($this->truncate($summTitle, 180)); + } else { + echo $this->transEsc('Title not available'); + } + ?></a> + </div> + + <div class="resultItemLine2"> + <? $snippet = $this->driver->getHighlightedSnippet(); ?> + <? $summary = $this->driver->getSummary(); ?> + <? if (!empty($snippet)): ?> + <?=$this->highlight($snippet['snippet'])?> + <? elseif (!empty($summary)): ?> + <?=$this->escapeHtml($summary[0])?> + <? endif; ?> + </div> + + <div class="resultItemLine3"> + <span style="color:#008000;" class="ui-li-desc"><?=$this->escapeHtml($url)?></span> + </div> +</div> +<div class="clearer"></div> \ No newline at end of file diff --git a/themes/blueprint/templates/web/home.phtml b/themes/blueprint/templates/web/home.phtml new file mode 100644 index 00000000000..d13d4348c1e --- /dev/null +++ b/themes/blueprint/templates/web/home.phtml @@ -0,0 +1 @@ +<?=$this->render('search/home.phtml');?> \ No newline at end of file diff --git a/themes/blueprint/templates/web/results.phtml b/themes/blueprint/templates/web/results.phtml new file mode 100644 index 00000000000..dd801a58a86 --- /dev/null +++ b/themes/blueprint/templates/web/results.phtml @@ -0,0 +1,5 @@ +<? + // Load standard settings from the default search results screen: + $this->hideCartControls = true; + echo $this->render('search/results.phtml'); +?> \ No newline at end of file diff --git a/themes/jquerymobile/templates/RecordDriver/SolrWeb/result-list.phtml b/themes/jquerymobile/templates/RecordDriver/SolrWeb/result-list.phtml new file mode 100644 index 00000000000..41e08d2e44e --- /dev/null +++ b/themes/jquerymobile/templates/RecordDriver/SolrWeb/result-list.phtml @@ -0,0 +1,26 @@ +<? 
+ $url = $this->driver->getUrl(); +?> +<a rel="external" href="<?=$this->escapeHtml($url)?>"> + <div class="result"> + <h3><? + $summHighlightedTitle = $this->driver->getHighlightedTitle(); + $summTitle = $this->driver->getTitle(); + if (!empty($summHighlightedTitle)) { + echo $this->highlight($this->addEllipsis($summHighlightedTitle, $summTitle)); + } else if (!empty($summTitle)) { + echo $this->escapeHtml($this->truncate($summTitle, 180)); + } else { + echo $this->transEsc('Title not available'); + } + ?></h3> + </div> + <? $snippet = $this->driver->getHighlightedSnippet(); ?> + <? $summary = $this->driver->getSummary(); ?> + <? if (!empty($snippet)): ?> + <p><?=$this->highlight($snippet['snippet'])?></p> + <? elseif (!empty($summary)): ?> + <p><?=$this->escapeHtml($summary[0])?></p> + <? endif; ?> + <span style="color:#008000;" class="ui-li-desc"><?=$this->escapeHtml($url)?></span> +</a> diff --git a/themes/jquerymobile/templates/web/home.phtml b/themes/jquerymobile/templates/web/home.phtml new file mode 100644 index 00000000000..881bf39a2f6 --- /dev/null +++ b/themes/jquerymobile/templates/web/home.phtml @@ -0,0 +1,4 @@ +<? + $this->overrideHomeOptions = ''; // turn off extra options + echo $this->render('search/home.phtml'); +?> \ No newline at end of file diff --git a/themes/jquerymobile/templates/web/results.phtml b/themes/jquerymobile/templates/web/results.phtml new file mode 100644 index 00000000000..9c5b81545ee --- /dev/null +++ b/themes/jquerymobile/templates/web/results.phtml @@ -0,0 +1,7 @@ +<? + // Override top search link: + $this->searchLink = 'web-home'; + + // Load standard settings from the default search results screen: + echo $this->render('search/results.phtml'); +?> \ No newline at end of file -- GitLab