diff --git a/harvest/batch-delete.bat b/harvest/batch-delete.bat new file mode 100644 index 0000000000000000000000000000000000000000..d73276833546d80a63902b6fa2e93b8af6de9bc0 --- /dev/null +++ b/harvest/batch-delete.bat @@ -0,0 +1,67 @@ +@echo off +rem Make sure that environment edits are local and that we have access to the +rem Windows command extensions. +setlocal enableextensions +if not errorlevel 1 goto extensionsokay +echo Unable to enable Windows command extensions. +goto end +:extensionsokay + +rem Make sure VUFIND_HOME is set: +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +rem VUFIND_HOME not set -- try to call vufind.bat to +rem fix the problem before we give up completely +if exist %0\..\..\vufind.bat goto usevufindbat +rem If vufind.bat doesn't exist, the user hasn't run install.bat yet. +echo ERROR: vufind.bat does not exist -- could not set up environment. +echo Please run install.bat to correct this problem. +goto end +:usevufindbat +cd %0\..\.. +call vufind > nul +cd %0\.. +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +echo You need to set the VUFIND_HOME environmental variable before running this script. +goto end +:vufindhomefound + +rem Make sure command line parameter was included: +if not "!%1!"=="!!" goto paramsokay +echo This script deletes records based on files created by the OAI-PMH harvester. +echo. +echo Usage: %0 [harvest subdirectory] [index type] +echo. +echo [harvest subdirectory] is a directory name created by the OAI-PMH harvester. +echo This script will search the harvest subdirectories of the directories defined +echo by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables. +echo. +echo [index type] is optional; defaults to Solr for main bibliographic index, but +echo can be set to SolrAuth for authority index. +echo. +echo Example: %0 oai_source +goto end +:paramsokay + +rem Check if the path is valid: +set BASEPATH="%VUFIND_LOCAL_DIR%\harvest\%1" +if exist %BASEPATH% goto basepathfound +set BASEPATH="%VUFIND_HOME%\harvest\%1" +if exist %BASEPATH% goto basepathfound +echo Directory %BASEPATH% does not exist! +goto end +:basepathfound + +rem Create log/processed directories as needed: +if exist %BASEPATH%\processed goto processedfound +md %BASEPATH%\processed +:processedfound + +rem Process all the files in the target directory: +cd %VUFIND_HOME%\util +for %%a in (%BASEPATH%\*.delete) do ( + echo Processing %%a... + php deletes.php %%a flat %2 + move %%a %BASEPATH%\processed\ > nul +) + +:end \ No newline at end of file diff --git a/harvest/batch-delete.sh b/harvest/batch-delete.sh new file mode 100644 index 0000000000000000000000000000000000000000..d0f14464c4b1ff342c59904a80c0be0b0c4c62cf --- /dev/null +++ b/harvest/batch-delete.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +# Make sure VUFIND_HOME is set: +if [ -z "$VUFIND_HOME" ] +then + echo "Please set the VUFIND_HOME environment variable." + exit 1 +fi + +# Make sure command line parameter was included: +if [ -z "$1" ] +then + echo "This script deletes records based on files created by the OAI-PMH harvester."; + echo "" + echo "Usage: `basename $0` [harvest subdirectory] [index type]" + echo "" + echo "[harvest subdirectory] is a directory name created by the OAI-PMH harvester." + echo "This script will search the harvest subdirectories of the directories defined" + echo "by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables." + echo "" + echo "[index type] is optional; defaults to Solr for main bibliographic index, but" + echo "can be set to SolrAuth for authority index." 
+ echo "" + echo "Example: `basename $0` oai_source" + exit 1 +fi + +# Check if the path is valid: +BASEPATH="$VUFIND_LOCAL_DIR/harvest/$1" +if [ ! -d $BASEPATH ] +then + BASEPATH="$VUFIND_HOME/harvest/$1" +fi +if [ ! -d $BASEPATH ] +then + echo "Directory $BASEPATH does not exist!" + exit 1 +fi + +# Create log/processed directories as needed: +if [ ! -d $BASEPATH/processed ] +then + mkdir $BASEPATH/processed +fi + +# Process all the files in the target directory: +cd $VUFIND_HOME/util +for file in $BASEPATH/*.delete +do + if [ -f $file ] + then + echo "Processing $file ..." + php deletes.php $file flat $2 + mv $file $BASEPATH/processed/`basename $file` + fi +done diff --git a/harvest/batch-import-marc-auth.bat b/harvest/batch-import-marc-auth.bat new file mode 100644 index 0000000000000000000000000000000000000000..6532044cf35ce46dff706df5a8919b26272dbea1 --- /dev/null +++ b/harvest/batch-import-marc-auth.bat @@ -0,0 +1,66 @@ +@echo off +rem Make sure that environment edits are local and that we have access to the +rem Windows command extensions. +setlocal enableextensions +if not errorlevel 1 goto extensionsokay +echo Unable to enable Windows command extensions. +goto end +:extensionsokay + +rem Make sure VUFIND_HOME is set: +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +rem VUFIND_HOME not set -- try to call vufind.bat to +rem fix the problem before we give up completely +if exist %0\..\..\vufind.bat goto usevufindbat +rem If vufind.bat doesn't exist, the user hasn't run install.bat yet. +echo ERROR: vufind.bat does not exist -- could not set up environment. +echo Please run install.bat to correct this problem. +goto end +:usevufindbat +cd %0\..\.. +call vufind > nul +cd %0\.. +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +echo You need to set the VUFIND_HOME environmental variable before running this script. +goto end +:vufindhomefound + +rem Make sure command line parameter was included: +if not "!%2!"=="!!" goto paramsokay +echo This script processes a batch of harvested authority records. +echo. +echo Usage: %0 [harvest subdirectory] [SolrMarc properties file] +echo. +echo [harvest subdirectory] is a directory name created by the OAI-PMH harvester. +echo This script will search the harvest subdirectories of the directories defined +echo by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables. +echo. +echo Example: %0 lcnaf marc_lcnaf.properties +goto end +:paramsokay + +rem Check if the path is valid: +set BASEPATH="%VUFIND_LOCAL_DIR%\harvest\%1" +if exist %BASEPATH% goto basepathfound +set BASEPATH="%VUFIND_HOME%\harvest\%1" +if exist %BASEPATH% goto basepathfound +echo Directory %BASEPATH% does not exist! +goto end +:basepathfound + +rem Create log/processed directories as needed: +if exist %BASEPATH%\log goto logfound +md %BASEPATH%\log +:logfound +if exist %BASEPATH%\processed goto processedfound +md %BASEPATH%\processed +:processedfound + +rem Process all the files in the target directory: +for %%a in (%BASEPATH%\*.xml %BASEPATH%\*.mrc) do ( + echo Processing %%a... 
+ call %VUFIND_HOME%\import-marc-auth.bat %%a %2 > %BASEPATH%\log\%%~nxa.log + move %%a %BASEPATH%\processed\ > nul +) + +:end \ No newline at end of file diff --git a/harvest/batch-import-marc-auth.sh b/harvest/batch-import-marc-auth.sh new file mode 100644 index 0000000000000000000000000000000000000000..3537a146eb097b790d1e6d7338a059aa5e4cb818 --- /dev/null +++ b/harvest/batch-import-marc-auth.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +# Make sure VUFIND_HOME is set: +if [ -z "$VUFIND_HOME" ] +then + echo "Please set the VUFIND_HOME environment variable." + exit 1 +fi + +# Make sure command line parameter was included: +if [ -z "$2" ] +then + echo "This script processes a batch of harvested authority records." + echo "" + echo "Usage: `basename $0` [harvest subdirectory] [SolrMarc properties file]" + echo "" + echo "[harvest subdirectory] is a directory name created by the OAI-PMH harvester." + echo "This script will search the harvest subdirectories of the directories defined" + echo "by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables." + echo "" + echo "Example: `basename $0` lcnaf marc_lcnaf.properties" + exit 1 +fi + +# Check if the path is valid: +BASEPATH="$VUFIND_LOCAL_DIR/harvest/$1" +if [ ! -d $BASEPATH ] +then + BASEPATH="$VUFIND_HOME/harvest/$1" +fi +if [ ! -d $BASEPATH ] +then + echo "Directory $BASEPATH does not exist!" + exit 1 +fi + +# Create log/processed directories as needed: +if [ ! -d $BASEPATH/log ] +then + mkdir $BASEPATH/log +fi +if [ ! -d $BASEPATH/processed ] +then + mkdir $BASEPATH/processed +fi + +# Process all the files in the target directory: +for file in $BASEPATH/*.xml $BASEPATH/*.mrc +do + if [ -f $file ] + then + echo "Processing $file ..." + $VUFIND_HOME/import-marc-auth.sh $file $2 > $BASEPATH/log/`basename $file`.log + mv $file $BASEPATH/processed/`basename $file` + fi +done diff --git a/harvest/batch-import-marc.bat b/harvest/batch-import-marc.bat new file mode 100644 index 0000000000000000000000000000000000000000..6dda9378e7d7634947769d9f2b81a575004dd7d5 --- /dev/null +++ b/harvest/batch-import-marc.bat @@ -0,0 +1,66 @@ +@echo off +rem Make sure that environment edits are local and that we have access to the +rem Windows command extensions. +setlocal enableextensions +if not errorlevel 1 goto extensionsokay +echo Unable to enable Windows command extensions. +goto end +:extensionsokay + +rem Make sure VUFIND_HOME is set: +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +rem VUFIND_HOME not set -- try to call vufind.bat to +rem fix the problem before we give up completely +if exist %0\..\..\vufind.bat goto usevufindbat +rem If vufind.bat doesn't exist, the user hasn't run install.bat yet. +echo ERROR: vufind.bat does not exist -- could not set up environment. +echo Please run install.bat to correct this problem. +goto end +:usevufindbat +cd %0\..\.. +call vufind > nul +cd %0\.. +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +echo You need to set the VUFIND_HOME environmental variable before running this script. +goto end +:vufindhomefound + +rem Make sure command line parameter was included: +if not "!%1!"=="!!" goto paramsokay +echo This script processes a batch of harvested MARC records. +echo. +echo Usage: %0 [harvest subdirectory] +echo. +echo [harvest subdirectory] is a directory name created by the OAI-PMH harvester. +echo This script will search the harvest subdirectories of the directories defined +echo by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables. +echo. 
+echo Example: %0 oai_source +goto end +:paramsokay + +rem Check if the path is valid: +set BASEPATH="%VUFIND_LOCAL_DIR%\harvest\%1" +if exist %BASEPATH% goto basepathfound +set BASEPATH="%VUFIND_HOME%\harvest\%1" +if exist %BASEPATH% goto basepathfound +echo Directory %BASEPATH% does not exist! +goto end +:basepathfound + +rem Create log/processed directories as needed: +if exist %BASEPATH%\log goto logfound +md %BASEPATH%\log +:logfound +if exist %BASEPATH%\processed goto processedfound +md %BASEPATH%\processed +:processedfound + +rem Process all the files in the target directory: +for %%a in (%BASEPATH%\*.xml %BASEPATH%\*.mrc) do ( + echo Processing %%a... + call %VUFIND_HOME%\import-marc.bat %%a > %BASEPATH%\log\%%~nxa.log + move %%a %BASEPATH%\processed\ > nul +) + +:end \ No newline at end of file diff --git a/harvest/batch-import-marc.sh b/harvest/batch-import-marc.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed8299f63314962a982200bb195ccdb6e3171acb --- /dev/null +++ b/harvest/batch-import-marc.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +# Make sure VUFIND_HOME is set: +if [ -z "$VUFIND_HOME" ] +then + echo "Please set the VUFIND_HOME environment variable." + exit 1 +fi + +# Make sure command line parameter was included: +if [ -z "$1" ] +then + echo "This script processes a batch of harvested MARC records." + echo "" + echo "Usage: `basename $0` [harvest subdirectory]" + echo "" + echo "[harvest subdirectory] is a directory name created by the OAI-PMH harvester." + echo "This script will search the harvest subdirectories of the directories defined" + echo "by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables." + echo "" + echo "Example: `basename $0` oai_source" + exit 1 +fi + +# Check if the path is valid: +BASEPATH="$VUFIND_LOCAL_DIR/harvest/$1" +if [ ! -d $BASEPATH ] +then + BASEPATH="$VUFIND_HOME/harvest/$1" +fi +if [ ! -d $BASEPATH ] +then + echo "Directory $BASEPATH does not exist!" + exit 1 +fi + +# Create log/processed directories as needed: +if [ ! -d $BASEPATH/log ] +then + mkdir $BASEPATH/log +fi +if [ ! -d $BASEPATH/processed ] +then + mkdir $BASEPATH/processed +fi + +# Process all the files in the target directory: +for file in $BASEPATH/*.xml $BASEPATH/*.mrc +do + if [ -f $file ] + then + echo "Processing $file ..." + $VUFIND_HOME/import-marc.sh $file > $BASEPATH/log/`basename $file`.log + mv $file $BASEPATH/processed/`basename $file` + fi +done diff --git a/harvest/batch-import-xsl.bat b/harvest/batch-import-xsl.bat new file mode 100644 index 0000000000000000000000000000000000000000..c057810833cbf6754a00f9019abc2ce65b14b5d3 --- /dev/null +++ b/harvest/batch-import-xsl.bat @@ -0,0 +1,84 @@ +@echo off +rem Make sure that environment edits are local and that we have access to the +rem Windows command extensions. +setlocal enableextensions +if not errorlevel 1 goto extensionsokay +echo Unable to enable Windows command extensions. +goto end +:extensionsokay + +rem Make sure VUFIND_HOME is set: +if not "!%VUFIND_HOME%!"=="!!" goto vufindhomefound +rem VUFIND_HOME not set -- try to call vufind.bat to +rem fix the problem before we give up completely +if exist %0\..\..\vufind.bat goto usevufindbat +rem If vufind.bat doesn't exist, the user hasn't run install.bat yet. +echo ERROR: vufind.bat does not exist -- could not set up environment. +echo Please run install.bat to correct this problem. +goto end +:usevufindbat +cd %0\..\.. +call vufind > nul +cd %0\.. +if not "!%VUFIND_HOME%!"=="!!" 
goto vufindhomefound +echo You need to set the VUFIND_HOME environmental variable before running this script. +goto end +:vufindhomefound + +rem Make sure command line parameter was included: +if not "!%2!"=="!!" goto paramsokay +echo This script processes a batch of harvested XML records using the specified XSL +echo import configuration file. +echo. +echo Usage: %0 [harvest subdirectory] [properties file] +echo. +echo [harvest subdirectory] is a directory name created by the OAI-PMH harvester. +echo This script will search the harvest subdirectories of the directories defined +echo by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables. +echo. +echo [properties file] is a configuration file found in the import subdirectory of +echo either your VUFIND_LOCAL_DIR or VUFIND_HOME directory. +echo. +echo Example: %0 oai_source ojs.properties +goto end +:paramsokay + +rem Check if the path is valid: +set BASEPATH="%VUFIND_LOCAL_DIR%\harvest\%1" +if exist %BASEPATH% goto basepathfound +set BASEPATH="%VUFIND_HOME%\harvest\%1" +if exist %BASEPATH% goto basepathfound +echo Directory %BASEPATH% does not exist! +goto end +:basepathfound + +rem Create log/processed directories as needed: +if exist %BASEPATH%\processed goto processedfound +md %BASEPATH%\processed +:processedfound + +rem Flag -- do we need to perform an optimize? +set OPTIMIZE=0 + +rem Process all the files in the target directory: +cd %VUFIND_HOME%\import +for %%a in (%BASEPATH%\*.xml) do ( + echo Processing %%a... + php import-xsl.php %%a %2 + rem Unfortunately, PHP doesn't seem to set apropriate errorlevels, so error + rem detection doesn't work under Windows like it does under Linux... however, + rem this code is retained in case PHP's behavior improves in the future! + if errorlevel 0 ( + move %%a %BASEPATH%\processed\ > nul + rem We processed a file, so we need to optimize later on: + set OPTIMIZE=1 + ) +) + +rem Optimize the index now that we are done (if necessary): +if not "%OPTIMIZE%!"=="1!" goto end +cd %VUFIND_HOME%\util +echo Optimizing index... +php optimize.php + +:end \ No newline at end of file diff --git a/harvest/batch-import-xsl.sh b/harvest/batch-import-xsl.sh new file mode 100644 index 0000000000000000000000000000000000000000..3ef97d235ea43a071d036f1b9d377e5440592d0d --- /dev/null +++ b/harvest/batch-import-xsl.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Make sure VUFIND_HOME is set: +if [ -z "$VUFIND_HOME" ] +then + echo "Please set the VUFIND_HOME environment variable." + exit 1 +fi + +# Make sure command line parameter was included: +if [ -z "$2" ] +then + echo "This script processes a batch of harvested XML records using the specified XSL" + echo "import configuration file." + echo "" + echo "Usage: `basename $0` [harvest subdirectory] [properties file]" + echo "" + echo "[harvest subdirectory] is a directory name created by the OAI-PMH harvester." + echo "This script will search the harvest subdirectories of the directories defined" + echo "by the VUFIND_LOCAL_DIR and VUFIND_HOME environment variables." + echo "" + echo "[properties file] is a configuration file found in the import subdirectory of" + echo "either your VUFIND_LOCAL_DIR or VUFIND_HOME directory." + echo "" + echo "Example: `basename $0` oai_source ojs.properties" + exit 1 +fi + +# Check if the path is valid: +BASEPATH="$VUFIND_LOCAL_DIR/harvest/$1" +if [ ! -d $BASEPATH ] +then + BASEPATH="$VUFIND_HOME/harvest/$1" +fi +if [ ! -d $BASEPATH ] +then + echo "Directory $BASEPATH does not exist!" 
+ exit 1 +fi + +# Create log/processed directories as needed: +if [ ! -d $BASEPATH/processed ] +then + mkdir $BASEPATH/processed +fi + +# Flag -- do we need to perform an optimize? +OPTIMIZE=0 + +# Process all the files in the target directory: +cd $VUFIND_HOME/import +for file in $BASEPATH/*.xml +do + if [ -f $file ] + then + echo "Processing $file ..." + php import-xsl.php $file $2 + # Only move the file into the "processed" folder if processing was successful: + if [ "$?" -eq "0" ] + then + mv $file $BASEPATH/processed/`basename $file` + # We processed a file, so we need to optimize later on: + OPTIMIZE=1 + fi + fi +done + +# Optimize the index now that we are done (if necessary): +if [ "$OPTIMIZE" -eq "1" ] +then + cd $VUFIND_HOME/util + echo "Optimizing index..." + php optimize.php +fi diff --git a/harvest/harvest_naf.php b/harvest/harvest_naf.php new file mode 100644 index 0000000000000000000000000000000000000000..87b27c5c74c9ee54380436c73822d77781b218a0 --- /dev/null +++ b/harvest/harvest_naf.php @@ -0,0 +1,32 @@ +<?php +/** + * Tool to harvest Library of Congress Name Authority File from OCLC. + * + * PHP version 5 + * + * Copyright (c) Demian Katz 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/authority_control Wiki + */ + +// Load the Zend framework -- this will automatically trigger the appropriate +// controller action based on directory and file names +define('CLI_DIR', __DIR__); // save directory name of current script +require_once __DIR__ . '/../public/index.php'; \ No newline at end of file diff --git a/harvest/harvest_oai.php b/harvest/harvest_oai.php new file mode 100644 index 0000000000000000000000000000000000000000..60a8e1d910b10ea2aa63e4658a8f656850d85b40 --- /dev/null +++ b/harvest/harvest_oai.php @@ -0,0 +1,32 @@ +<?php +/** + * OAI-PMH Harvest Tool + * + * PHP version 5 + * + * Copyright (c) Demian Katz 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/importing_records#oai-pmh_harvesting Wiki + */ + +// Load the Zend framework -- this will automatically trigger the appropriate +// controller action based on directory and file names +define('CLI_DIR', __DIR__); // save directory name of current script +require_once __DIR__ . '/../public/index.php'; \ No newline at end of file diff --git a/harvest/oai.ini b/harvest/oai.ini new file mode 100644 index 0000000000000000000000000000000000000000..67de45b3729a6ee02d0a157df0cea503267a6e2b --- /dev/null +++ b/harvest/oai.ini @@ -0,0 +1,82 @@ +; oai.ini -- OAI-PMH harvest settings. +; +; For every OAI-PMH source you would like to harvest, create a section like this: +; +; [section_name] +; url = http://oai.myuniversity.edu/ +; set = my_optional_set +; metadataPrefix = oai_dc +; idSearch[] = "/oai:myuniversity.edu:/" +; idReplace[] = "myprefix-" +; injectDate = false +; injectId = false +; injectSetName = false +; injectSetSpec = false +; dateGranularity = auto +; harvestedIdLog = harvest.log +; verbose = false +; +; The section_name may be passed to harvest_oai.php as a parameter to harvest only +; records from that source. This is also the directory name that records will be +; harvested into (a subdirectory of "harvest" under VUFIND_HOME). +; +; url is the base URL for the OAI-PMH source. +; +; set is the identifier of a set to harvest (normally found in the <setSpec> tag of +; an OAI-PMH ListSets response). Omit this setting to harvest all records. +; +; metadataPrefix is the metadata format to harvest (oai_dc will be used by default +; if the value is omitted). +; +; idPrefix is the OAI-specific prefix attached to ID values. If you provide the +; value here, it will be automatically stripped for you when generating filenames, +; injecting IDs and tracking deleted records. If you omit the setting, full +; OAI ids will be retained. [DEPRECATED -- use idSearch and idReplace instead] +; +; idSearch[] and idReplace[] may be used to manipulate IDs with regular expressions. +; This is useful for adding or removing prefixes and swapping out problematic +; characters. You may use multiple pairs of search and replace settings to perform +; multiple manipulations. +; +; injectDate may be set to an XML tag name in order to inject the datestamp of +; the record into the harvested metadata (enclosed in the specified tag). If +; omitted or set to false, no datestamp-related changes will be made to the +; harvested metadata. +; +; injectId may be set to an XML tag name in order to inject the ID of the record +; into the harvested metadata (enclosed in the specified tag). If omitted or set +; to false, no ID-related changes will be made to the harvested metadata. +; +; injectSetName may be set to an XML tag name in order to inject the setName value +; of the record into the harvested metadata (enclosed in the specified tag). If +; omitted or set to false, no setName-related changes will be made to the harvested +; metadata. 
+; +; injectSetSpec may be set to an XML tag name in order to inject the setSpec value +; of the record into the harvested metadata (enclosed in the specified tag). If +; omitted or set to false, no setSpec-related changes will be made to the harvested +; metadata. +; +; dateGranularity is the granularity used by the server for representing dates. +; This may be "YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (to query the server +; for details). The default is "auto." +; +; harvestedIdLog is a filename (inside your harvest directory) for a text file +; listing all non-deleted harvested records encountered. If you omit this setting, +; no log file will be generated. Subsequent harvests will append to the file if +; it already exists. +; +; verbose may be set to true in order to display more detailed output while +; harvesting; this may be useful for troubleshooting purposes, but it defaults to +; false. + +; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS +;[OJS] +;url = http://ojs.myuniversity.edu/oai +;metadataPrefix = oai_dc +;idSearch[] = "/^oai:myuniversity.edu:/" +;idReplace[] = "ojs-" +;idSearch[] = "/\//" +;idReplace[] = "-" +;injectId = "identifier" +;injectDate = "datestamp" \ No newline at end of file diff --git a/module/VuFind/CLI/Module.php b/module/VuFind/CLI/Module.php index d9de28cf96ae73454a7cf9bf63d70fd6e854a990..5b3c9359ad452e7bd8eca6a73782fd6668ac9c9b 100644 --- a/module/VuFind/CLI/Module.php +++ b/module/VuFind/CLI/Module.php @@ -33,9 +33,9 @@ class Module $filename = $args[0]; $pwd = $server->get('PWD', CLI_DIR); - // Convert base filename (minus .php extension) and containing directory - // name into action and controller, respectively: - $baseFilename = basename($filename); + // Convert base filename (minus .php extension and underscores) and + // containing directory name into action and controller, respectively: + $baseFilename = str_replace('_', '', basename($filename)); $baseFilename = substr($baseFilename, 0, strlen($baseFilename) - 4); $baseDirname = basename(dirname(realpath($pwd . '/' . $filename))); $routeMatch = new RouteMatch( diff --git a/module/VuFind/CLI/config/module.config.php b/module/VuFind/CLI/config/module.config.php index 92e27c0d7250e30c700746ac737148eddd871b0b..512418d341dfbb27bfd68b487652cb4080c53f27 100644 --- a/module/VuFind/CLI/config/module.config.php +++ b/module/VuFind/CLI/config/module.config.php @@ -4,6 +4,7 @@ namespace VuFind\CLI\Module\Configuration; $config = array( 'controllers' => array( 'invokables' => array( + 'harvest' => 'VuFind\CLI\Controller\HarvestController', 'import' => 'VuFind\CLI\Controller\ImportController', ), ), diff --git a/module/VuFind/src/VuFind/CLI/Controller/HarvestController.php b/module/VuFind/src/VuFind/CLI/Controller/HarvestController.php new file mode 100644 index 0000000000000000000000000000000000000000..a572c8f6e47a4c10eb54c88931fe0fb3842f2152 --- /dev/null +++ b/module/VuFind/src/VuFind/CLI/Controller/HarvestController.php @@ -0,0 +1,116 @@ +<?php +/** + * CLI Controller Module + * + * PHP version 5 + * + * Copyright (C) Villanova University 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Controller + * @author Chris Hallberg <challber@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/building_a_recommendations_module Wiki + */ +namespace VuFind\CLI\Controller; +use VuFind\Config\Reader as ConfigReader, VuFind\Harvester\NAF, VuFind\Harvester\OAI; + +/** + * This controller handles various command-line tools + * + * @category VuFind2 + * @package Controller + * @author Chris Hallberg <challber@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/building_a_recommendations_module Wiki + */ +class HarvestController extends AbstractBase +{ + /** + * Harvest the LC Name Authority File. + * + * @return void + */ + public function harvestnafAction() + { + $this->checkLocalSetting(); + + // Perform the harvest. Note that first command line parameter + // may be used to start at a particular date. + try { + $harvest = new NAF(); + $argv = $this->consoleOpts->getRemainingArgs(); + if (isset($argv[0])) { + $harvest->setStartDate($argv[0]); + } + $harvest->launch(); + } catch (\Exception $e) { + echo $e->getMessage() . "\n"; + return $this->getFailureResponse(); + } + return $this->getSuccessResponse(); + } + + /** + * Harvest OAI-PMH records. + * + * @return void + */ + public function harvestoaiAction() + { + $this->checkLocalSetting(); + + // Read Config files + $configFile = ConfigReader::getConfigPath('oai.ini', 'harvest'); + $oaiSettings = @parse_ini_file($configFile, true); + if (empty($oaiSettings)) { + echo "Please add OAI-PMH settings to oai.ini.\n"; + return $this->getFailureResponse(); + } + + // If first command line parameter is set, see if we can limit to just the + // specified OAI harvester: + $argv = $this->consoleOpts->getRemainingArgs(); + if (isset($argv[0])) { + if (isset($oaiSettings[$argv[0]])) { + $oaiSettings = array($argv[0] => $oaiSettings[$argv[0]]); + } else { + echo "Could not load settings for {$argv[0]}.\n"; + return $this->getFailureResponse(); + } + } + + // Loop through all the settings and perform harvests: + $processed = 0; + foreach ($oaiSettings as $target => $settings) { + if (!empty($target) && !empty($settings)) { + echo "Processing {$target}...\n"; + try { + $harvest = new OAI($target, $settings); + $harvest->launch(); + } catch (\Exception $e) { + echo $e->getMessage() . "\n"; + return $this->getFailureResponse(); + } + $processed++; + } + } + + // All done. + echo "Completed without errors -- {$processed} source(s) processed.\n"; + return $this->getSuccessResponse(); + } +} diff --git a/module/VuFind/src/VuFind/Harvester/NAF.php b/module/VuFind/src/VuFind/Harvester/NAF.php new file mode 100644 index 0000000000000000000000000000000000000000..f8a944abd0806f4db6c19b5aa8039adfdbc2ef6f --- /dev/null +++ b/module/VuFind/src/VuFind/Harvester/NAF.php @@ -0,0 +1,525 @@ +<?php +/** + * Tool to harvest Library of Congress Name Authority File from OCLC. + * + * PHP version 5 + * + * Copyright (c) Demian Katz 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/authority_control Wiki + */ +namespace VuFind\Harvester; +use VuFind\Connection\SRU; + +/** + * NAF Class + * + * This class harvests OCLC's Name Authority File to MARC-XML documents on the + * local disk. + * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/authority_control Wiki + */ +class NAF +{ + protected $sru; // SRU connection + protected $basePath; // Directory for storing harvested files + protected $lastHarvestFile; // File for tracking last harvest date + + // Start scanning at an arbitrary date known to be earlier than the + // oldest possible document. + protected $startDate = '1900-01-01'; + + /** + * Constructor. + */ + public function __construct() + { + // Don't time out during harvest!! + set_time_limit(0); + + // Set up base directory for harvested files: + if (strlen(LOCAL_OVERRIDE_DIR) > 0) { + $home = LOCAL_OVERRIDE_DIR; + } else { + $home = realpath(APPLICATION_PATH . '/..'); + } + $this->basePath = $home . '/harvest/lcnaf/'; + if (!is_dir($this->basePath)) { + if (!mkdir($this->basePath)) { + throw new \Exception("Problem creating directory {$this->basePath}."); + } + } + + // Check if there is a file containing a start date: + $this->lastHarvestFile = $this->basePath . 'last_harvest.txt'; + $this->loadLastHarvestedDate(); + + // Set up SRU connection: + $this->sru = new SRU('http://alcme.oclc.org/srw/search/lcnaf'); + } + + /** + * Set a start date for the harvest (only harvest records AFTER this date). + * + * @param string $date Start date (YYYY-MM-DD format). + * + * @return void + */ + public function setStartDate($date) + { + $this->startDate = $date; + } + + /** + * Harvest all available documents. + * + * @return void + */ + public function launch() + { + $this->scanDates($this->startDate); + $this->detectDeletes(); + } + + /** + * Harvest LCCNs from OCLC to a file. + * + * @return string Filename of harvested data. + */ + protected function harvestOCLCIds() + { + // Harvest all LCCNs to a file: + $lccnListFile = dirname(__FILE__) . '/lcnaf/lccn-list-' . time() . '.tmp'; + $lccnList = fopen($lccnListFile, 'w'); + if (!$lccnList) { + throw new \Exception('Problem opening file: ' . $lccnListFile . "."); + } + $lccn = ''; + do { + $lccn = $this->scanLCCNs($lccnList, $lccn); + } while ($lccn); + fclose($lccnList); + return $lccnListFile; + } + + /** + * Harvest IDs from local Solr index to a file. + * + * @return string Filename of harvested data. + */ + protected function harvestLocalIds() + { + // Harvest all local IDs to a file: + $localListFile = dirname(__FILE__) . '/lcnaf/id-list-' . time() . '.tmp'; + $localList = fopen($localListFile, 'w'); + if (!$localList) { + throw new \Exception('Problem opening file: ' . 
$localListFile . "."); + } + $id = ''; + $solr = ConnectionManager::connectToIndex('SolrAuth'); + do { + echo "Reading IDs starting with '{$id}'...\n"; + $list = $solr->getTerms('id', $id, 10000); + if (isset($list['terms']['id']) && !empty($list['terms']['id'])) { + foreach ($list['terms']['id'] as $id => $count) { + fwrite($localList, $id . "\n"); + } + } else { + $id = false; + } + } while ($id); + fclose($localList); + return $localListFile; + } + + /** + * Given sorted ID lists, determine which have been deleted and which are + * missing from the index. + * + * @param string $sortedOclcFile File containing list of remote OCLC IDs. + * @param string $sortedLocalFile File containing list of local IDs. + * @param string $deletedFile Filename to write deleted list to. + * + * @return void + */ + protected function performDeleteComparison($sortedOclcFile, $sortedLocalFile, + $deletedFile + ) { + $oclcIn = fopen($sortedOclcFile, 'r'); + if (!$oclcIn) { + throw new \Exception("Can't open {$sortedOclcFile}"); + } + $localIn = fopen($sortedLocalFile, 'r'); + if (!$localIn) { + throw new \Exception("Can't open {$sortedLocalFile}"); + } + $deleted = fopen($deletedFile, 'w'); + if (!$deleted) { + throw new \Exception("Can't open {$deletedFile}"); + } + + // Flags to control which file(s) we read from: + $readOclc = $readLocal = true; + + // Loop until we reach the ends of both files: + do { + // Read the next line from each file if necessary: + if ($readOclc) { + $oclcCurrent = fgets($oclcIn); + } + if ($readLocal) { + $localCurrent = fgets($localIn); + } + + if (!$localCurrent || strcmp($oclcCurrent, $localCurrent) < 0) { + // If OCLC is less than local (or we've reached the end of the + // local file), we've found a record that hasn't been indexed yet; + // no action is needed -- just skip it and read the next OCLC line. + $readOclc = true; + $readLocal = false; + } else if (!$oclcCurrent || strcmp($oclcCurrent, $localCurrent) > 0) { + // If OCLC is greater than local (or we've reached the end of the + // OCLC file), we've found a deleted record; write it to file and + // read the next local value. + fputs($deleted, $localCurrent); + $readOclc = false; + $readLocal = true; + } else { + // If current lines match, just read another pair of lines: + $readOclc = $readLocal = true; + } + } while ($oclcCurrent || $localCurrent); + + fclose($oclcIn); + fclose($localIn); + fclose($deleted); + } + + /** + * Scan the index for deleted records. + * + * @return void + */ + protected function detectDeletes() + { + // Harvest IDs from local and OCLC indexes: + $oclcFile = $this->harvestOCLCIds(); + $localFile = $this->harvestLocalIds(); + + // Sort the two lists consistently: + $sortedOclcFile = dirname(__FILE__) . '/lcnaf/lccn-sorted.txt'; + $sortedLocalFile = dirname(__FILE__) . '/lcnaf/id-sorted.txt'; + + exec("sort < {$oclcFile} > {$sortedOclcFile}"); + exec("sort < {$localFile} > {$sortedLocalFile}"); + + // Delete unsorted data files: + unlink($oclcFile); + unlink($localFile); + + // Diff the files in order to generate a .delete file so we can remove + // obsolete records from the Solr index: + $deletedFile = dirname(__FILE__) . '/lcnaf/' . time() . 
'.delete'; + $this->performDeleteComparison( + $sortedOclcFile, $sortedLocalFile, $deletedFile + ); + + // Deleted sorted data files now that we are done with them: + unlink($sortedOclcFile); + unlink($sortedLocalFile); + } + + /** + * Normalize an LCCN to match an ID generated by the LCNAF SolrMarc import + * process (see the various .bsh files in import/index_scripts). + * + * @param string $lccn Regular LCCN + * + * @return string Normalized LCCN + */ + protected function normalizeLCCN($lccn) + { + // Remove whitespace: + $lccn = str_replace(" ", "", $lccn); + + // Chop off anything following a forward slash: + $parts = explode('/', $lccn, 2); + $lccn = $parts[0]; + + // Normalize any characters following a hyphen to at least six digits: + $parts = explode('-', $lccn, 2); + if (count($parts) > 1) { + $secondPart = $parts[1]; + while (strlen($secondPart) < 6) { + $secondPart = "0" . $secondPart; + } + $lccn = $parts[0] . $secondPart; + } + + // Send back normalized LCCN: + return 'lcnaf-' . $lccn; + } + + /** + * Recursively obtain all of the LCCNs from the LCNAF index. + * + * @param resource $handle File handle to write normalized LCCNs to. + * @param string $start Starting point in list to read from + * @param int $retry Retry counter (in case of connection problems). + * + * @return string Where to start the next scan to continue the + * operation (boolean false when finished). + */ + protected function scanLCCNs($handle, $start = '', $retry = 0) + { + echo "Scanning LCCNs after \"{$start}\"...\n"; + + // Find all dates AFTER the specified start date + try { + $result = $this->sru->scan('local.LCCN="' . $start . '"', 0, 250); + } catch (\Exception $e) { + $result = false; + } + if (!empty($result)) { + // Parse the response: + $result = simplexml_load_string($result); + if (!$result) { + // We experienced a failure; let's retry three times before we + // give up and report failure. + if ($retry > 2) { + throw new \Exception("Problem loading XML: {$result}"); + } else { + echo "Problem loading XML; retrying...\n"; + // Wait a few seconds in case that helps... + sleep(5); + + return $this->scanLCCNs($handle, $start, $retry + 1); + } + } + + // Extract terms from the response: + $namespaces = $result->getDocNamespaces(); + $result->registerXPathNamespace('ns', $namespaces['']); + $result = $result->xpath('ns:terms/ns:term'); + + // No terms? We've hit the end of the road! + if (!is_array($result)) { + return; + } + + // Process all the dates in this batch: + foreach ($result as $term) { + $lccn = (string)$term->value; + $count = (int)$term->numberOfRecords; + fwrite($handle, $this->normalizeLCCN($lccn) . "\n"); + } + } + + // Continue scanning with results following the last date encountered + // in the loop above: + return isset($lccn) ? $lccn : false; + } + + /** + * Retrieve the date from the "last harvested" file and use it as our start + * date if it is available. + * + * @return void + */ + protected function loadLastHarvestedDate() + { + if (file_exists($this->lastHarvestFile)) { + $lines = file($this->lastHarvestFile); + if (is_array($lines)) { + $date = trim($lines[0]); + if (!empty($date)) { + $this->setStartDate(trim($date)); + } + } + } + } + + /** + * Save a date to the "last harvested" file. + * + * @param string $date Date to save. + * + * @return void + */ + protected function saveLastHarvestedDate($date) + { + file_put_contents($this->lastHarvestFile, $date); + } + + /** + * Retrieve records modified on the specified date. 
+ * + * @param string $date Date of modification for retrieved records + * @param int $count Number of records expected (double-check) + * + * @return void + */ + protected function processDate($date, $count) + { + // Don't reload data we already have! + $path = $this->basePath . $date . '.xml'; + if (file_exists($path)) { + return; + } + + echo "Processing records for {$date}...\n"; + + // Open the output file: + $file = fopen($path, 'w'); + $startTag = '<mx:collection xmlns:mx="http://www.loc.gov/MARC21/slim">'; + if (!$file || !fwrite($file, $startTag)) { + unlink($path); + throw new \Exception("Unable to open {$path} for writing."); + } + + // Pull down all the records: + $start = 1; + $limit = 250; + $query = 'oai.datestamp="' . $date . '"'; + do { + $numFound = $this->getRecords($query, $start, $limit, $file); + $start += $numFound; + } while ($numFound == $limit); + + // Close the file: + if (!fwrite($file, '</mx:collection>') || !fclose($file)) { + unlink($path); + throw new \Exception("Problem closing file."); + } + + // Sanity check -- did we get as many records as we expected to? + $finalCount = $start - 1; + if ($finalCount != $count) { + // Delete the problem file so we can rebuild it later: + unlink($path); + throw new \Exception( + "Problem loading records for {$date} -- " . + "expected {$count}, retrieved {$finalCount}." + ); + } + + // Update the "last harvested" file: + $this->saveLastHarvestedDate($date); + } + + /** + * Pull down records from LC NAF. + * + * @param string $query Search query for loading records + * @param int $start Index of first record to load + * @param int $limit Maximum number of records to load + * @param int $file Open file handle to write records to + * + * @return int Actual number of records loaded + */ + protected function getRecords($query, $start, $limit, $file) + { + // Retrieve the records: + $xml = $this->sru->search( + $query, $start, $limit, null, 'info:srw/schema/1/marcxml-v1.1', false + ); + $result = simplexml_load_string($xml); + if (!$result) { + throw new \Exception("Problem loading XML: {$xml}"); + } + + // Extract the records from the response: + $namespaces = $result->getDocNamespaces(); + $result->registerXPathNamespace('ns', $namespaces['']); + $result->registerXPathNamespace('mx', 'http://www.loc.gov/MARC21/slim'); + $result = $result->xpath('ns:records/ns:record/ns:recordData/mx:record'); + + // No records? We've hit the end of the line! + if (empty($result)) { + return 0; + } + + // Process records and return a bad value if we have trouble writing + // (in order to ensure that we die and can retry later): + foreach ($result as $current) { + if (!fwrite($file, $current->asXML())) { + return 0; + } + } + + // If we found less than the limit, we've hit the end of the list; + // otherwise, we should return the index of the next record to load: + return count($result); + } + + /** + * Recursively scan the remote index to find dates we can retrieve. + * + * @param string $start The date to use as the basis for scanning; this date + * will NOT be included in results. + * + * @return void + */ + protected function scanDates($start) + { + echo "Scanning dates after {$start}...\n"; + + // Find all dates AFTER the specified start date + try { + $result = $this->sru->scan('oai.datestamp="' . $start . 
'"', 0, 250); + } catch (\Exception $e) { + $result = false; + } + if (!empty($result)) { + // Parse the response: + $result = simplexml_load_string($result); + if (!$result) { + throw new \Exception("Problem loading XML: {$result}"); + } + + // Extract terms from the response: + $namespaces = $result->getDocNamespaces(); + $result->registerXPathNamespace('ns', $namespaces['']); + $result = $result->xpath('ns:terms/ns:term'); + + // No terms? We've hit the end of the road! + if (!is_array($result)) { + return; + } + + // Process all the dates in this batch: + foreach ($result as $term) { + $date = (string)$term->value; + $count = (int)$term->numberOfRecords; + $this->processDate($date, $count); + } + } + + // Continue scanning with results following the last date encountered + // in the loop above: + if (isset($date)) { + $this->scanDates($date); + } + } +} diff --git a/module/VuFind/src/VuFind/Harvester/OAI.php b/module/VuFind/src/VuFind/Harvester/OAI.php new file mode 100644 index 0000000000000000000000000000000000000000..c4791087cc8e4c948fd5709c27d0bbc02314ceff --- /dev/null +++ b/module/VuFind/src/VuFind/Harvester/OAI.php @@ -0,0 +1,600 @@ +<?php +/** + * OAI-PMH Harvest Tool + * + * PHP version 5 + * + * Copyright (c) Demian Katz 2010. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/importing_records#oai-pmh_harvesting Wiki + */ +namespace VuFind\Harvester; +use VuFind\Http\Client; + +/** + * OAI Class + * + * This class harvests records via OAI-PMH using settings from oai.ini. 
+ * + * @category VuFind2 + * @package Harvest_Tools + * @author Demian Katz <demian.katz@villanova.edu> + * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License + * @link http://vufind.org/wiki/importing_records#oai-pmh_harvesting Wiki + */ +class OAI +{ + protected $baseURL; // URL to harvest from + protected $set = null; // Target set to harvest (null for all records) + protected $metadata = 'oai_dc'; // Metadata type to harvest + protected $idPrefix = ''; // OAI prefix to strip from ID values + protected $idSearch = array(); // Regular expression searches + protected $idReplace = array(); // Replacements for regular expression matches + protected $basePath; // Directory for storing harvested files + protected $lastHarvestFile; // File for tracking last harvest date + protected $startDate = null; // Harvest start date (null for all records) + protected $granularity = 'auto'; // Date granularity + protected $injectId = false; // Tag to use for injecting IDs into XML + protected $injectSetSpec = false; // Tag to use for injecting setSpecs + protected $injectSetName = false; // Tag to use for injecting set names + protected $injectDate = false; // Tag to use for injecting datestamp + protected $setNames = array(); // Associative array of setSpec => setName + protected $harvestedIdLog = false;// Filename for logging harvested IDs. + protected $verbose = false; // Should we display debug output? + + // As we harvest records, we want to track the most recent date encountered + // so we can set a start point for the next harvest. + protected $endDate = 0; + + /** + * Constructor. + * + * @param string $target Target directory for harvest. + * @param array $settings OAI-PMH settings from oai.ini. + */ + public function __construct($target, $settings) + { + // Don't time out during harvest!! + set_time_limit(0); + + // Set up base directory for harvested files: + $this->setBasePath($target); + + // Check if there is a file containing a start date: + $this->lastHarvestFile = $this->basePath . 'last_harvest.txt'; + $this->loadLastHarvestedDate(); + + // Set up base URL: + if (empty($settings['url'])) { + throw new \Exception("Missing base URL for {$target}."); + } + $this->baseURL = $settings['url']; + if (isset($settings['set'])) { + $this->set = $settings['set']; + } + if (isset($settings['metadataPrefix'])) { + $this->metadata = $settings['metadataPrefix']; + } + if (isset($settings['idPrefix'])) { + $this->idPrefix = $settings['idPrefix']; + } + if (isset($settings['idSearch'])) { + $this->idSearch = $settings['idSearch']; + } + if (isset($settings['idReplace'])) { + $this->idReplace = $settings['idReplace']; + } + if (isset($settings['harvestedIdLog'])) { + $this->harvestedIdLog = $settings['harvestedIdLog']; + } + if (isset($settings['injectId'])) { + $this->injectId = $settings['injectId']; + } + if (isset($settings['injectSetSpec'])) { + $this->injectSetSpec = $settings['injectSetSpec']; + } + if (isset($settings['injectSetName'])) { + $this->injectSetName = $settings['injectSetName']; + $this->loadSetNames(); + } + if (isset($settings['injectDate'])) { + $this->injectDate = $settings['injectDate']; + } + if (isset($settings['dateGranularity'])) { + $this->granularity = $settings['dateGranularity']; + } + if (isset($settings['verbose'])) { + $this->verbose = $settings['verbose']; + } + if ($this->granularity == 'auto') { + $this->loadGranularity(); + } + } + + /** + * Set a start date for the harvest (only harvest records AFTER this date). 
+ * + * @param string $date Start date (YYYY-MM-DD format). + * + * @return void + */ + public function setStartDate($date) + { + $this->startDate = $date; + } + + /** + * Harvest all available documents. + * + * @return void + */ + public function launch() + { + // Start harvesting at the requested date: + $token = $this->getRecordsByDate($this->startDate, $this->set); + + // Keep harvesting as long as a resumption token is provided: + while ($token !== false) { + $token = $this->getRecordsByToken($token); + } + } + + /** + * Set up directory structure for harvesting (support method for constructor). + * + * @param string $target The OAI-PMH target directory to create. + * + * @return void + */ + protected function setBasePath($target) + { + // Get the base VuFind path: + if (strlen(LOCAL_OVERRIDE_DIR) > 0) { + $home = LOCAL_OVERRIDE_DIR; + } else { + $home = realpath(APPLICATION_PATH . '/..'); + } + + // Build the full harvest path: + $this->basePath = $home . '/harvest/' . $target . '/'; + + // Create the directory if it does not already exist: + if (!is_dir($this->basePath)) { + if (!mkdir($this->basePath)) { + throw new \Exception("Problem creating directory {$this->basePath}."); + } + } + } + + /** + * Retrieve the date from the "last harvested" file and use it as our start + * date if it is available. + * + * @return void + */ + protected function loadLastHarvestedDate() + { + if (file_exists($this->lastHarvestFile)) { + $lines = file($this->lastHarvestFile); + if (is_array($lines)) { + $date = trim($lines[0]); + if (!empty($date)) { + $this->setStartDate(trim($date)); + } + } + } + } + + /** + * Normalize a date to a Unix timestamp. + * + * @param string $date Date (ISO-8601 or YYYY-MM-DD HH:MM:SS) + * + * @return integer Unix timestamp (or false if $date invalid) + */ + protected function normalizeDate($date) + { + // Remove timezone markers -- we don't want PHP to outsmart us by adjusting + // the time zone! + $date = str_replace(array('T', 'Z'), array(' ', ''), $date); + + // Translate to a timestamp: + return strtotime($date); + } + + /** + * Save a date to the "last harvested" file. + * + * @param string $date Date to save. + * + * @return void + */ + protected function saveLastHarvestedDate($date) + { + file_put_contents($this->lastHarvestFile, $date); + } + + /** + * Make an OAI-PMH request. Die if there is an error; return a SimpleXML object + * on success. + * + * @param string $verb OAI-PMH verb to execute. + * @param array $params GET parameters for ListRecords method. + * + * @return object SimpleXML-formatted response. + */ + protected function sendRequest($verb, $params = array()) + { + // Debug: + if ($this->verbose) { + echo "Sending request: verb = {$verb}, params = "; + print_r($params); + } + + // Set up retry loop: + while (true) { + // Set up the request: + $request = new Client( + null, array('timeout' => 60) // TODO: make timeout configurable + ); + $request->setUri($this->baseURL); + + // Load request parameters: + $query = $request->getRequest()->getQuery(); + $query->set('verb', $verb); + foreach ($params as $key => $value) { + $query->set($key, $value); + } + + // Perform request and die on error: + $result = $request->setMethod('GET')->send(); + if ($result->getStatusCode() == 503) { + $delayHeader = $result->getHeaders()->get('Retry-After'); + $delay = is_object($delayHeader) + ? 
$delayHeader->getDeltaSeconds() : 0; + if ($delay > 0) { + if ($this->verbose) { + echo "Received 503 response; waiting {$delay} seconds...\n"; + } + sleep($delay); + } + } else if (!$result->isSuccess()) { + throw new \Exception('HTTP Error'); + } else { + // If we didn't get an error, we can leave the retry loop: + break; + } + } + + // If we got this far, there was no error -- send back response. + return $this->processResponse($result->getBody()); + } + + /** + * Process an OAI-PMH response into a SimpleXML object. Die if an error is + * detected. + * + * @param string $xml OAI-PMH response XML. + * + * @return object SimpleXML-formatted response. + */ + protected function processResponse($xml) + { + // Parse the XML: + $result = simplexml_load_string($xml); + if (!$result) { + throw new \Exception("Problem loading XML: {$xml}"); + } + + // Detect errors and die if one is found: + if ($result->error) { + $attribs = $result->error->attributes(); + throw new \Exception( + "OAI-PMH error -- code: {$attribs['code']}, " . + "value: {$result->error}" + ); + } + + // If we got this far, we have a valid response: + return $result; + } + + /** + * Get the filename for a specific record ID. + * + * @param string $id ID of record to save. + * @param string $ext File extension to use. + * + * @return string Full path + filename. + */ + protected function getFilename($id, $ext) + { + return $this->basePath . time() . '_' . + preg_replace('/[^\w]/', '_', $id) . '.' . $ext; + } + + /** + * Create a tracking file to record the deletion of a record. + * + * @param string $id ID of deleted record. + * + * @return void + */ + protected function saveDeletedRecord($id) + { + $filename = $this->getFilename($id, 'delete'); + file_put_contents($filename, $id); + } + + /** + * Save a record to disk. + * + * @param string $id ID of record to save. + * @param object $record Record to save (in SimpleXML format). + * + * @return void + */ + protected function saveRecord($id, $record) + { + if (!isset($record->metadata)) { + throw new \Exception("Unexpected missing record metadata."); + } + + // Extract the actual metadata from inside the <metadata></metadata> tags; + // there is probably a cleaner way to do this, but this simple method avoids + // the complexity of dealing with namespaces in SimpleXML: + $xml = trim($record->metadata->asXML()); + $xml = preg_replace('/(^<metadata>)|(<\/metadata>$)/m', '', $xml); + + // If we are supposed to inject any values, do so now inside the first + // tag of the file: + $insert = ''; + if (!empty($this->injectId)) { + $insert .= "<{$this->injectId}>" . htmlspecialchars($id) . + "</{$this->injectId}>"; + } + if (!empty($this->injectDate)) { + $insert .= "<{$this->injectDate}>" . + htmlspecialchars((string)$record->header->datestamp) . + "</{$this->injectDate}>"; + } + if (!empty($this->injectSetSpec)) { + if (isset($record->header->setSpec)) { + foreach ($record->header->setSpec as $current) { + $insert .= "<{$this->injectSetSpec}>" . + htmlspecialchars((string)$current) . + "</{$this->injectSetSpec}>"; + } + } + } + if (!empty($this->injectSetName)) { + if (isset($record->header->setSpec)) { + foreach ($record->header->setSpec as $current) { + $name = $this->setNames[(string)$current]; + $insert .= "<{$this->injectSetName}>" . + htmlspecialchars($name) . + "</{$this->injectSetName}>"; + } + } + } + if (!empty($insert)) { + $xml = preg_replace('/>/', '>' . 
$insert, $xml, 1); + } + + // Save our XML: + file_put_contents($this->getFilename($id, 'xml'), trim($xml)); + } + + /** + * Load date granularity from the server. + * + * @return void + */ + protected function loadGranularity() + { + echo "Autodetecting date granularity... "; + $response = $this->sendRequest('Identify'); + $this->granularity = (string)$response->Identify->granularity; + echo "found {$this->granularity}.\n"; + } + + /** + * Load set list from the server. + * + * @return void + */ + protected function loadSetNames() + { + echo "Loading set list... "; + + // On the first pass through the following loop, we want to get the + // first page of sets without using a resumption token: + $params = array(); + + // Grab set information until we have it all (at which point we will + // break out of this otherwise-infinite loop): + while (true) { + // Process current page of results: + $response = $this->sendRequest('ListSets', $params); + if (isset($response->ListSets->set)) { + foreach ($response->ListSets->set as $current) { + $spec = (string)$current->setSpec; + $name = (string)$current->setName; + if (!empty($spec)) { + $this->setNames[$spec] = $name; + } + } + } + + // Is there a resumption token? If so, continue looping; if not, + // we're done! + if (isset($response->ListSets->resumptionToken) + && !empty($response->ListSets->resumptionToken) + ) { + $params['resumptionToken'] + = (string)$response->ListSets->resumptionToken; + } else { + echo "found " . count($this->setNames) . "\n"; + return; + } + } + } + + /** + * Extract the ID from a record object (support method for _processRecords()). + * + * @param object $record SimpleXML record. + * + * @return string The ID value. + */ + protected function extractID($record) + { + // Normalize to string: + $id = (string)$record->header->identifier; + + // Strip prefix if found: + if (substr($id, 0, strlen($this->idPrefix)) == $this->idPrefix) { + $id = substr($id, strlen($this->idPrefix)); + } + + // Apply regular expression matching: + if (!empty($this->idSearch)) { + $id = preg_replace($this->idSearch, $this->idReplace, $id); + } + + // Return final value: + return $id; + } + + /** + * Save harvested records to disk and track the end date. + * + * @param object $records SimpleXML records. + * + * @return void + */ + protected function processRecords($records) + { + echo 'Processing ' . count($records) . " records...\n"; + + // Array for tracking successfully harvested IDs: + $harvestedIds = array(); + + // Loop through the records: + foreach ($records as $record) { + // Die if the record is missing its header: + if (empty($record->header)) { + throw new \Exception("Unexpected missing record header."); + } + + // Get the ID of the current record: + $id = $this->extractID($record); + + // Save the current record, either as a deleted or as a regular file: + $attribs = $record->header->attributes(); + if (strtolower($attribs['status']) == 'deleted') { + $this->saveDeletedRecord($id); + } else { + $this->saveRecord($id, $record); + $harvestedIds[] = $id; + } + + // If the current record's date is newer than the previous end date, + // remember it for future reference: + $date = $this->normalizeDate($record->header->datestamp); + if ($date && $date > $this->endDate) { + $this->endDate = $date; + } + } + + // Do we have IDs to log and a log filename? If so, log them: + if (!empty($this->harvestedIdLog) && !empty($harvestedIds)) { + $file = fopen($this->basePath . 
$this->harvestedIdLog, 'a'); + if (!$file) { + throw new \Exception("Problem opening {$this->harvestedIdLog}."); + } + fputs($file, implode(PHP_EOL, $harvestedIds)); + fclose($file); + } + } + + /** + * Harvest records using OAI-PMH. + * + * @param array $params GET parameters for ListRecords method. + * + * @return mixed Resumption token if provided, false if finished + */ + protected function getRecords($params) + { + // Make the OAI-PMH request: + $response = $this->sendRequest('ListRecords', $params); + + // Save the records from the response: + if ($response->ListRecords->record) { + $this->processRecords($response->ListRecords->record); + } + + // If we have a resumption token, keep going; otherwise, we're done -- save + // the end date. + if (isset($response->ListRecords->resumptionToken) + && !empty($response->ListRecords->resumptionToken) + ) { + return $response->ListRecords->resumptionToken; + } else if ($this->endDate > 0) { + $dateFormat = ($this->granularity == 'YYYY-MM-DD') ? + 'Y-m-d' : 'Y-m-d\TH:i:s\Z'; + $this->saveLastHarvestedDate(date($dateFormat, $this->endDate)); + } + return false; + } + + /** + * Harvest records via OAI-PMH using date and set. + * + * @param string $date Harvest start date (null for all records). + * @param string $set Set to harvest (null for all records). + * + * @return mixed Resumption token if provided, false if finished + */ + protected function getRecordsByDate($date = null, $set = null) + { + $params = array('metadataPrefix' => $this->metadata); + if (!empty($date)) { + $params['from'] = $date; + } + if (!empty($set)) { + $params['set'] = $set; + } + return $this->getRecords($params); + } + + /** + * Harvest records via OAI-PMH using resumption token. + * + * @param string $token Resumption token. + * + * @return mixed Resumption token if provided, false if finished + */ + protected function getRecordsByToken($token) + { + return $this->getRecords(array('resumptionToken' => (string)$token)); + } +}
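
A minimal end-to-end sketch of how these pieces fit together (illustrative only, not part of the patch): it assumes VUFIND_HOME is already set, that harvest/oai.ini contains a configured [oai_source] section, and that the source delivers MARC XML. The section name "oai_source" is purely hypothetical; for non-MARC XML sources, batch-import-xsl.sh with a properties file would replace the MARC import step.

# Hypothetical usage, assuming VUFIND_HOME is set and an [oai_source] section exists in harvest/oai.ini:
cd "$VUFIND_HOME/harvest"
php harvest_oai.php oai_source       # harvest new/changed records for that section into harvest/oai_source/
./batch-import-marc.sh oai_source    # index harvested MARC XML/MRC files, logging output and moving them to oai_source/processed/
./batch-delete.sh oai_source         # apply any .delete files the harvester wrote for records deleted at the source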