diff --git a/config/vufind/fulltext.ini b/config/vufind/fulltext.ini
index c9bad66037a731c8d2fde991d9c175f02da3196b..7a057fb5ac525059a8f1e49289fd92db051df857 100644
--- a/config/vufind/fulltext.ini
+++ b/config/vufind/fulltext.ini
@@ -2,6 +2,12 @@
 ; on the XSLT import tool for more details:
 ; http://vufind.org/wiki/importing_records#importing_with_xslt
 
+; Use this setting to set your default parser (either Aperture or Tika); if you omit
+; this setting, VuFind will attempt to auto-detect the first available uncommented
+; tool below.
+;[General]
+;parser = Tika
+
 ; Aperture is a Java tool for extracting full text from documents. It is not
 ; included with VuFind by default, but it can be downloaded here:
 ; http://aperture.sourceforge.net/
@@ -11,3 +17,11 @@
 ; and fill in the appropriate path to take advantage of it.
 ;webcrawler = "/usr/local/aperture/bin/webcrawler.sh" ; Linux
 ;webcrawler = "c:\aperture\bin\webcrawler.bat" ; Windows
+
+; Tika is another Java tool for extracting full text from documents. It is not
+; included with VuFind by default, but it can be downloaded here:
+; http://tika.apache.org/download.html
+; VuFind's Tika code was tested with version 1.2 of Tika.
+[Tika]
+; Download the jar file and fill in the appropriate path to use it.
+;path = "/usr/local/tika/tika.jar"
\ No newline at end of file
diff --git a/import/xsl/nlm_ojs.xsl b/import/xsl/nlm_ojs.xsl
index a8f2087f527ef494c0dd94e9a8ba86411261a2a5..9528c0b9910395172a9b57ea19fd1ee34dfbdd5e 100644
--- a/import/xsl/nlm_ojs.xsl
+++ b/import/xsl/nlm_ojs.xsl
@@ -206,7 +206,7 @@
                 <xsl:otherwise>
                     <xsl:for-each select="//nlm:self-uri[@content-type=&quot;application/pdf&quot;]">
                         <field name="fulltext">
-                            <xsl:value-of select="php:function('VuFind::harvestWithAperture', string(./@xlink:href))"/>
+                            <xsl:value-of select="php:function('VuFind::harvestWithParser', string(./@xlink:href))"/>
                         </field>
                     </xsl:for-each>
                 </xsl:otherwise>
diff --git a/import/xsl/vudl.xsl b/import/xsl/vudl.xsl
index fd118d900299b933b64ddf222a2a5f5bcaedc531..daadcf4882ca07bae49ef780b5d6f4f747e7cdc4 100644
--- a/import/xsl/vudl.xsl
+++ b/import/xsl/vudl.xsl
@@ -143,7 +143,7 @@
                     <xsl:value-of select="php:function('VuFind::harvestTextFile', string(./@xlink:href))"/>
                 </xsl:for-each>
                 <xsl:for-each select="//METS:fileGrp[@USE=&quot;TRANSCRIPTION&quot;]/METS:file/METS:FLocat">
-                    <xsl:value-of select="php:function('VuFind::harvestWithAperture', string(./@xlink:href))"/>
+                    <xsl:value-of select="php:function('VuFind::harvestWithParser', string(./@xlink:href))"/>
                 </xsl:for-each>
             </field>
         </doc>
diff --git a/module/VuFind/src/VuFind/XSLT/Import/VuFind.php b/module/VuFind/src/VuFind/XSLT/Import/VuFind.php
index 88a5e5a9db6953c51390b9e1032bf459602bb1e8..03e4930f74b362876f5b3376cfeeae8bd9248a99 100644
--- a/module/VuFind/src/VuFind/XSLT/Import/VuFind.php
+++ b/module/VuFind/src/VuFind/XSLT/Import/VuFind.php
@@ -74,7 +74,6 @@ class VuFind
      * @param string $date Date record was last modified.
      *
      * @return string First index date/time.
-     * @access public
      */
     public static function getFirstIndexed($core, $id, $date)
     {
@@ -92,7 +91,6 @@ class VuFind
      * @param string $date Date record was last modified.
      *
      * @return string Latest index date/time.
-     * @access public
      */
     public static function getLastIndexed($core, $id, $date)
     {
@@ -108,7 +106,6 @@ class VuFind
      * @param string $url URL of file to retrieve.
      *
      * @return string file contents.
-     * @access public
      */
     public static function harvestTextFile($url)
     {
@@ -125,30 +122,109 @@ class VuFind
     }
 
     /**
-     * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
-     * This method will only work if Aperture is properly configured in the
-     * fulltext.ini file. Without proper configuration, this will simply return
-     * an empty string.
+     * Read parser method from fulltext.ini
      *
-     * @param string $url URL of file to retrieve.
+     * @return string Name of parser to use (i.e. Aperture or Tika)
+     */
+    public static function getParser()
+    {
+        $settings = ConfigReader::getConfig('fulltext');
+
+        // Is user preference explicitly set?
+        if (isset($settings->General->parser)) {
+            return $settings->General->parser;
+        }
+
+        // Is Aperture enabled?
+        if (isset($settings->Aperture->webcrawler)) {
+            return 'Aperture';
+        }
+
+        // Is Tika enabled?
+        if (isset($settings->Tika->path)) {
+            return 'Tika';
+        }
+
+        // If we got this far, no parser is available:
+        return 'None';
+    }
+
+    /**
+     * Call parsing method based on parser setting in fulltext.ini
      *
-     * @return string text contents of file.
-     * @access public
+     * @param string $url URL to harvest
+     *
+     * @return string Text contents of URL
      */
-    public static function harvestWithAperture($url)
+    public static function harvestWithParser($url)
     {
-        // Determine the base Aperture command (or fail if it is not configured):
+        $parser = self::getParser();
+        switch (strtolower($parser)) {
+        case 'aperture':
+            return self::harvestWithAperture($url);
+        case 'tika':
+            return self::harvestWithTika($url);
+        default:
+            // Ignore unrecognized parser option:
+            return '';
+        }
+    }
+
+    /**
+     * Generic method for building Aperture Command
+     *
+     * @param string $input  name of input file | url
+     * @param string $output name of output file
+     * @param string $method webcrawler | filecrawler
+     *
+     * @return string command to be executed
+     */
+    public static function getApertureCommand($input, $output,
+        $method = "webcrawler"
+    ) {
+        // get the path to our sh/bat from the config
         $settings = ConfigReader::getConfig('fulltext');
         if (!isset($settings->Aperture->webcrawler)) {
             return '';
         }
         $cmd = $settings->Aperture->webcrawler;
 
+        // if we're using another method - substitute that into the path
+        $cmd = ($method != "webcrawler")
+            ? str_replace('webcrawler', $method, $cmd) : $cmd;
+
+        // return the full command; the arguments are shell-escaped because the
+        // input may be a URL taken from harvested data
+        return "{$cmd} -o " . escapeshellarg($output)
+            . ' -x ' . escapeshellarg($input);
+    }
+
+    /**
+     * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
+     * This method will only work if Aperture is properly configured in the
+     * fulltext.ini file. Without proper configuration, this will simply return an
+     * empty string.
+     *
+     * @param string $url    URL of file to retrieve.
+     * @param string $method webcrawler | filecrawler
+     *
+     * @return string text contents of file.
+     */
+    public static function harvestWithAperture($url, $method = "webcrawler")
+    {
         // Build a filename for temporary XML storage:
         $xmlFile = tempnam('/tmp', 'apt');
 
+        // Determine the base Aperture command (or fail if it is not configured):
+        $aptCmd = self::getApertureCommand($url, $xmlFile, $method);
+        if (empty($aptCmd)) {
+            // Remove the file tempnam() created:
+            @unlink($xmlFile);
+            return '';
+        }
+
         // Call Aperture:
-        exec("$cmd -o $xmlFile -x $url");
+        exec($aptCmd);
 
         // If we failed to process the file, give up now:
         if (!file_exists($xmlFile)) {
@@ -168,6 +244,94 @@ class VuFind
         return preg_replace($badChars, ' ', $final);
     }
 
+    /**
+     * Generic method for building Tika command
+     *
+     * @param string $input  url | fileresource
+     * @param string $output name of output file
+     * @param string $arg    optional Tika arguments
+     *
+     * @return array Parameters for proc_open command (empty array if Tika is
+     * not configured)
+     */
+    public static function getTikaCommand($input, $output, $arg)
+    {
+        $settings = ConfigReader::getConfig('fulltext');
+        if (!isset($settings->Tika->path)) {
+            return array();
+        }
+        $tika = $settings->Tika->path;
+
+        // We need to use this method to get the output from STDOUT into the file
+        $descriptorspec = array(
+            0 => array('pipe', 'r'),
+            1 => array('file', $output, 'w'),
+            2 => array('pipe', 'w')
+        );
+
+        // Escape the jar path and the input (which may be a URL taken from
+        // harvested data); $arg is passed through as-is so that it can carry
+        // several options.
+        $cmd = 'java -jar ' . escapeshellarg($tika) . " $arg -eUTF8 "
+            . escapeshellarg($input);
+        return array($cmd, $descriptorspec, array());
+    }
+
+    /**
+     * Harvest the contents of a document file (PDF, Word, etc.) using Tika.
+     * This method will only work if Tika is properly configured in the
+     * fulltext.ini file. Without proper configuration, this will simply return an
+     * empty string.
+     *
+     * @param string $url URL of file to retrieve.
+     * @param string $arg optional argument(s) for Tika
+     *
+     * @return string text contents of file.
+     */
+    public static function harvestWithTika($url, $arg = "--text")
+    {
+        // Build a filename for temporary storage of Tika's output:
+        $outputFile = tempnam('/tmp', 'tika');
+
+        // Determine the base Tika command (or fail if it is not configured):
+        $tikaCommand = self::getTikaCommand($url, $outputFile, $arg);
+        if (empty($tikaCommand)) {
+            @unlink($outputFile);
+            return '';
+        }
+
+        // Execute Tika; STDOUT is written straight to the temporary file:
+        $pipes = array();
+        $process = proc_open($tikaCommand[0], $tikaCommand[1], $pipes);
+        if (!is_resource($process)) {
+            @unlink($outputFile);
+            return '';
+        }
+
+        // Close STDIN and drain the remaining pipes before waiting on the
+        // process; an unread STDERR pipe can fill up and block Tika forever.
+        if (isset($pipes[0])) {
+            fclose($pipes[0]);
+            unset($pipes[0]);
+        }
+        foreach ($pipes as $pipe) {
+            stream_get_contents($pipe);
+            fclose($pipe);
+        }
+        proc_close($process);
+
+        // If we failed to process the file, give up now:
+        if (!file_exists($outputFile)) {
+            return '';
+        }
+
+        // Read the extracted text and clean up the temporary file:
+        $txt = file_get_contents($outputFile);
+        @unlink($outputFile);
+
+        return ($txt === false) ? '' : $txt;
+    }
+
     /**
      * Map string using a config file from the translation_maps folder.
      *
@@ -175,7 +339,6 @@ class VuFind
      * @param string $filename filename of map file
      *
      * @return string mapped text.
-     * @access public
      */
     public static function mapString($in, $filename)
     {
@@ -201,7 +364,6 @@ class VuFind
      * @param string $in title to process.
      *
      * @return string article-stripped text.
-     * @access public
      */
     public static function stripArticles($in)
     {
@@ -226,7 +388,6 @@ class VuFind
      * @param array $in array of DOMElement objects.
      *
      * @return string XML as string
-     * @access public
      */
     public static function xmlAsText($in)
     {
@@ -261,7 +422,6 @@ class VuFind
      * @param string $tag name of tag to remove
      *
      * @return string XML as string
-     * @access public
      */
     public static function removeTagAndReturnXMLasText($in, $tag)
     {
@@ -287,7 +447,6 @@ class VuFind
      * @param string $string String to split
      *
      * @return DOMDocument
-     * @access public
      */
     public static function explode($delimiter, $string)
     {