From 2329a5af076978b8f626db911fda3af08b281ffb Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Wed, 13 Mar 2013 10:34:12 -0400 Subject: [PATCH] Removing NAF harvest tool; the API this depends on is broken and will not be fixed. Authority data can now be conveniently downloaded directly from OCLC: http://www.oclc.org/research/activities/fast/download.html http://viaf.org/viaf/data/ --- harvest/harvest_naf.php | 32 -- module/VuFind/src/VuFind/Harvester/NAF.php | 529 ------------------ .../Controller/HarvestController.php | 29 +- 3 files changed, 1 insertion(+), 589 deletions(-) delete mode 100644 harvest/harvest_naf.php delete mode 100644 module/VuFind/src/VuFind/Harvester/NAF.php diff --git a/harvest/harvest_naf.php b/harvest/harvest_naf.php deleted file mode 100644 index 87b27c5c74c..00000000000 --- a/harvest/harvest_naf.php +++ /dev/null @@ -1,32 +0,0 @@ -<?php -/** - * Tool to harvest Library of Congress Name Authority File from OCLC. - * - * PHP version 5 - * - * Copyright (c) Demian Katz 2010. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * @category VuFind2 - * @package Harvest_Tools - * @author Demian Katz <demian.katz@villanova.edu> - * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License - * @link http://vufind.org/wiki/authority_control Wiki - */ - -// Load the Zend framework -- this will automatically trigger the appropriate -// controller action based on directory and file names -define('CLI_DIR', __DIR__); // save directory name of current script -require_once __DIR__ . '/../public/index.php'; \ No newline at end of file diff --git a/module/VuFind/src/VuFind/Harvester/NAF.php b/module/VuFind/src/VuFind/Harvester/NAF.php deleted file mode 100644 index 870feab09d3..00000000000 --- a/module/VuFind/src/VuFind/Harvester/NAF.php +++ /dev/null @@ -1,529 +0,0 @@ -<?php -/** - * Tool to harvest Library of Congress Name Authority File from OCLC. - * - * PHP version 5 - * - * Copyright (c) Demian Katz 2010. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * @category VuFind2 - * @package Harvest_Tools - * @author Demian Katz <demian.katz@villanova.edu> - * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License - * @link http://vufind.org/wiki/vufind2:developer_manual Wiki - */ -namespace VuFind\Harvester; -use VuFind\Connection\SRU, Zend\Console\Console; - -/** - * NAF Class - * - * This class harvests OCLC's Name Authority File to MARC-XML documents on the - * local disk. - * - * @category VuFind2 - * @package Harvest_Tools - * @author Demian Katz <demian.katz@villanova.edu> - * @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License - * @link http://vufind.org/wiki/vufind2:developer_manual Wiki - */ -class NAF -{ - protected $sru; // SRU connection - protected $basePath; // Directory for storing harvested files - protected $lastHarvestFile; // File for tracking last harvest date - - // Start scanning at an arbitrary date known to be earlier than the - // oldest possible document. - protected $startDate = '1900-01-01'; - - /** - * Constructor. - * - * @param \Zend\Http\Client $client An HTTP client object - */ - public function __construct(\Zend\Http\Client $client) - { - // Don't time out during harvest!! - set_time_limit(0); - - // Set up base directory for harvested files: - if (strlen(LOCAL_OVERRIDE_DIR) > 0) { - $home = LOCAL_OVERRIDE_DIR; - } else { - $home = realpath(APPLICATION_PATH . '/..'); - } - $this->basePath = $home . '/harvest/lcnaf/'; - if (!is_dir($this->basePath)) { - if (!@mkdir($this->basePath)) { - throw new \Exception( - "Problem creating directory {$this->basePath}." - ); - } - } - - // Check if there is a file containing a start date: - $this->lastHarvestFile = $this->basePath . 'last_harvest.txt'; - $this->loadLastHarvestedDate(); - - // Set up SRU connection: - $this->sru = new SRU('http://alcme.oclc.org/srw/search/lcnaf', $client); - } - - /** - * Set a start date for the harvest (only harvest records AFTER this date). - * - * @param string $date Start date (YYYY-MM-DD format). - * - * @return void - */ - public function setStartDate($date) - { - $this->startDate = $date; - } - - /** - * Harvest all available documents. - * - * @return void - */ - public function launch() - { - $this->scanDates($this->startDate); - $this->detectDeletes(); - } - - /** - * Harvest LCCNs from OCLC to a file. - * - * @return string Filename of harvested data. - */ - protected function harvestOCLCIds() - { - // Harvest all LCCNs to a file: - $lccnListFile = dirname(__FILE__) . '/lcnaf/lccn-list-' . time() . '.tmp'; - $lccnList = fopen($lccnListFile, 'w'); - if (!$lccnList) { - throw new \Exception('Problem opening file: ' . $lccnListFile . "."); - } - $lccn = ''; - do { - $lccn = $this->scanLCCNs($lccnList, $lccn); - } while ($lccn); - fclose($lccnList); - return $lccnListFile; - } - - /** - * Harvest IDs from local Solr index to a file. - * - * @return string Filename of harvested data. - */ - protected function harvestLocalIds() - { - // Harvest all local IDs to a file: - $localListFile = dirname(__FILE__) . '/lcnaf/id-list-' . time() . '.tmp'; - $localList = fopen($localListFile, 'w'); - if (!$localList) { - throw new \Exception('Problem opening file: ' . $localListFile . "."); - } - $id = ''; - $solr = \VuFind\Connection\Manager::connectToIndex('SolrAuth'); - do { - Console::writeLine("Reading IDs starting with '{$id}'..."); - $list = $solr->getTerms('id', $id, 10000); - if (isset($list['terms']['id']) && !empty($list['terms']['id'])) { - foreach ($list['terms']['id'] as $id => $count) { - fwrite($localList, $id . "\n"); - } - } else { - $id = false; - } - } while ($id); - fclose($localList); - return $localListFile; - } - - /** - * Given sorted ID lists, determine which have been deleted and which are - * missing from the index. - * - * @param string $sortedOclcFile File containing list of remote OCLC IDs. - * @param string $sortedLocalFile File containing list of local IDs. - * @param string $deletedFile Filename to write deleted list to. - * - * @return void - */ - protected function performDeleteComparison($sortedOclcFile, $sortedLocalFile, - $deletedFile - ) { - $oclcIn = fopen($sortedOclcFile, 'r'); - if (!$oclcIn) { - throw new \Exception("Can't open {$sortedOclcFile}"); - } - $localIn = fopen($sortedLocalFile, 'r'); - if (!$localIn) { - throw new \Exception("Can't open {$sortedLocalFile}"); - } - $deleted = fopen($deletedFile, 'w'); - if (!$deleted) { - throw new \Exception("Can't open {$deletedFile}"); - } - - // Flags to control which file(s) we read from: - $readOclc = $readLocal = true; - - // Loop until we reach the ends of both files: - do { - // Read the next line from each file if necessary: - if ($readOclc) { - $oclcCurrent = fgets($oclcIn); - } - if ($readLocal) { - $localCurrent = fgets($localIn); - } - - if (!$localCurrent || strcmp($oclcCurrent, $localCurrent) < 0) { - // If OCLC is less than local (or we've reached the end of the - // local file), we've found a record that hasn't been indexed yet; - // no action is needed -- just skip it and read the next OCLC line. - $readOclc = true; - $readLocal = false; - } else if (!$oclcCurrent || strcmp($oclcCurrent, $localCurrent) > 0) { - // If OCLC is greater than local (or we've reached the end of the - // OCLC file), we've found a deleted record; write it to file and - // read the next local value. - fputs($deleted, $localCurrent); - $readOclc = false; - $readLocal = true; - } else { - // If current lines match, just read another pair of lines: - $readOclc = $readLocal = true; - } - } while ($oclcCurrent || $localCurrent); - - fclose($oclcIn); - fclose($localIn); - fclose($deleted); - } - - /** - * Scan the index for deleted records. - * - * @return void - */ - protected function detectDeletes() - { - // Harvest IDs from local and OCLC indexes: - $oclcFile = $this->harvestOCLCIds(); - $localFile = $this->harvestLocalIds(); - - // Sort the two lists consistently: - $sortedOclcFile = dirname(__FILE__) . '/lcnaf/lccn-sorted.txt'; - $sortedLocalFile = dirname(__FILE__) . '/lcnaf/id-sorted.txt'; - - exec("sort < {$oclcFile} > {$sortedOclcFile}"); - exec("sort < {$localFile} > {$sortedLocalFile}"); - - // Delete unsorted data files: - unlink($oclcFile); - unlink($localFile); - - // Diff the files in order to generate a .delete file so we can remove - // obsolete records from the Solr index: - $deletedFile = dirname(__FILE__) . '/lcnaf/' . time() . '.delete'; - $this->performDeleteComparison( - $sortedOclcFile, $sortedLocalFile, $deletedFile - ); - - // Deleted sorted data files now that we are done with them: - unlink($sortedOclcFile); - unlink($sortedLocalFile); - } - - /** - * Normalize an LCCN to match an ID generated by the LCNAF SolrMarc import - * process (see the various .bsh files in import/index_scripts). - * - * @param string $lccn Regular LCCN - * - * @return string Normalized LCCN - */ - protected function normalizeLCCN($lccn) - { - // Remove whitespace: - $lccn = str_replace(" ", "", $lccn); - - // Chop off anything following a forward slash: - $parts = explode('/', $lccn, 2); - $lccn = $parts[0]; - - // Normalize any characters following a hyphen to at least six digits: - $parts = explode('-', $lccn, 2); - if (count($parts) > 1) { - $secondPart = $parts[1]; - while (strlen($secondPart) < 6) { - $secondPart = "0" . $secondPart; - } - $lccn = $parts[0] . $secondPart; - } - - // Send back normalized LCCN: - return 'lcnaf-' . $lccn; - } - - /** - * Recursively obtain all of the LCCNs from the LCNAF index. - * - * @param resource $handle File handle to write normalized LCCNs to. - * @param string $start Starting point in list to read from - * @param int $retry Retry counter (in case of connection problems). - * - * @return string Where to start the next scan to continue the - * operation (boolean false when finished). - */ - protected function scanLCCNs($handle, $start = '', $retry = 0) - { - Console::writeLine("Scanning LCCNs after \"{$start}\"..."); - - // Find all dates AFTER the specified start date - try { - $result = $this->sru->scan('local.LCCN="' . $start . '"', 0, 250); - } catch (\Exception $e) { - $result = false; - } - if (!empty($result)) { - // Parse the response: - $result = simplexml_load_string($result); - if (!$result) { - // We experienced a failure; let's retry three times before we - // give up and report failure. - if ($retry > 2) { - throw new \Exception("Problem loading XML: {$result}"); - } else { - Console::writeLine("Problem loading XML; retrying..."); - // Wait a few seconds in case that helps... - sleep(5); - - return $this->scanLCCNs($handle, $start, $retry + 1); - } - } - - // Extract terms from the response: - $namespaces = $result->getDocNamespaces(); - $result->registerXPathNamespace('ns', $namespaces['']); - $result = $result->xpath('ns:terms/ns:term'); - - // No terms? We've hit the end of the road! - if (!is_array($result)) { - return; - } - - // Process all the dates in this batch: - foreach ($result as $term) { - $lccn = (string)$term->value; - $count = (int)$term->numberOfRecords; - fwrite($handle, $this->normalizeLCCN($lccn) . "\n"); - } - } - - // Continue scanning with results following the last date encountered - // in the loop above: - return isset($lccn) ? $lccn : false; - } - - /** - * Retrieve the date from the "last harvested" file and use it as our start - * date if it is available. - * - * @return void - */ - protected function loadLastHarvestedDate() - { - if (file_exists($this->lastHarvestFile)) { - $lines = file($this->lastHarvestFile); - if (is_array($lines)) { - $date = trim($lines[0]); - if (!empty($date)) { - $this->setStartDate(trim($date)); - } - } - } - } - - /** - * Save a date to the "last harvested" file. - * - * @param string $date Date to save. - * - * @return void - */ - protected function saveLastHarvestedDate($date) - { - file_put_contents($this->lastHarvestFile, $date); - } - - /** - * Retrieve records modified on the specified date. - * - * @param string $date Date of modification for retrieved records - * @param int $count Number of records expected (double-check) - * - * @return void - */ - protected function processDate($date, $count) - { - // Don't reload data we already have! - $path = $this->basePath . $date . '.xml'; - if (file_exists($path)) { - return; - } - - Console::writeLine("Processing records for {$date}..."); - - // Open the output file: - $file = fopen($path, 'w'); - $startTag = '<mx:collection xmlns:mx="http://www.loc.gov/MARC21/slim">'; - if (!$file || !fwrite($file, $startTag)) { - unlink($path); - throw new \Exception("Unable to open {$path} for writing."); - } - - // Pull down all the records: - $start = 1; - $limit = 250; - $query = 'oai.datestamp="' . $date . '"'; - do { - $numFound = $this->getRecords($query, $start, $limit, $file); - $start += $numFound; - } while ($numFound == $limit); - - // Close the file: - if (!fwrite($file, '</mx:collection>') || !fclose($file)) { - unlink($path); - throw new \Exception("Problem closing file."); - } - - // Sanity check -- did we get as many records as we expected to? - $finalCount = $start - 1; - if ($finalCount != $count) { - // Delete the problem file so we can rebuild it later: - unlink($path); - throw new \Exception( - "Problem loading records for {$date} -- " . - "expected {$count}, retrieved {$finalCount}." - ); - } - - // Update the "last harvested" file: - $this->saveLastHarvestedDate($date); - } - - /** - * Pull down records from LC NAF. - * - * @param string $query Search query for loading records - * @param int $start Index of first record to load - * @param int $limit Maximum number of records to load - * @param int $file Open file handle to write records to - * - * @return int Actual number of records loaded - */ - protected function getRecords($query, $start, $limit, $file) - { - // Retrieve the records: - $xml = $this->sru->search( - $query, $start, $limit, null, 'info:srw/schema/1/marcxml-v1.1', false - ); - $result = simplexml_load_string($xml); - if (!$result) { - throw new \Exception("Problem loading XML: {$xml}"); - } - - // Extract the records from the response: - $namespaces = $result->getDocNamespaces(); - $result->registerXPathNamespace('ns', $namespaces['']); - $result->registerXPathNamespace('mx', 'http://www.loc.gov/MARC21/slim'); - $result = $result->xpath('ns:records/ns:record/ns:recordData/mx:record'); - - // No records? We've hit the end of the line! - if (empty($result)) { - return 0; - } - - // Process records and return a bad value if we have trouble writing - // (in order to ensure that we die and can retry later): - foreach ($result as $current) { - if (!fwrite($file, $current->asXML())) { - return 0; - } - } - - // If we found less than the limit, we've hit the end of the list; - // otherwise, we should return the index of the next record to load: - return count($result); - } - - /** - * Recursively scan the remote index to find dates we can retrieve. - * - * @param string $start The date to use as the basis for scanning; this date - * will NOT be included in results. - * - * @return void - */ - protected function scanDates($start) - { - Console::writeLine("Scanning dates after {$start}..."); - - // Find all dates AFTER the specified start date - try { - $result = $this->sru->scan('oai.datestamp="' . $start . '"', 0, 250); - } catch (\Exception $e) { - $result = false; - } - if (!empty($result)) { - // Parse the response: - $result = simplexml_load_string($result); - if (!$result) { - throw new \Exception("Problem loading XML: {$result}"); - } - - // Extract terms from the response: - $namespaces = $result->getDocNamespaces(); - $result->registerXPathNamespace('ns', $namespaces['']); - $result = $result->xpath('ns:terms/ns:term'); - - // No terms? We've hit the end of the road! - if (!is_array($result)) { - return; - } - - // Process all the dates in this batch: - foreach ($result as $term) { - $date = (string)$term->value; - $count = (int)$term->numberOfRecords; - $this->processDate($date, $count); - } - } - - // Continue scanning with results following the last date encountered - // in the loop above: - if (isset($date)) { - $this->scanDates($date); - } - } -} diff --git a/module/VuFindConsole/src/VuFindConsole/Controller/HarvestController.php b/module/VuFindConsole/src/VuFindConsole/Controller/HarvestController.php index dac8c25c1cc..8acab394ccb 100644 --- a/module/VuFindConsole/src/VuFindConsole/Controller/HarvestController.php +++ b/module/VuFindConsole/src/VuFindConsole/Controller/HarvestController.php @@ -26,7 +26,7 @@ * @link http://vufind.org/wiki/vufind2:building_a_controller Wiki */ namespace VuFindConsole\Controller; -use VuFind\Harvester\NAF, VuFind\Harvester\OAI, Zend\Console\Console; +use VuFind\Harvester\OAI, Zend\Console\Console; /** * This controller handles various command-line tools @@ -39,33 +39,6 @@ use VuFind\Harvester\NAF, VuFind\Harvester\OAI, Zend\Console\Console; */ class HarvestController extends AbstractBase { - /** - * Harvest the LC Name Authority File. - * - * @return \Zend\Console\Response - */ - public function harvestnafAction() - { - $this->checkLocalSetting(); - - // Perform the harvest. Note that first command line parameter - // may be used to start at a particular date. - try { - $harvest = new NAF( - $this->getServiceLocator()->get('VuFind\Http')->createClient() - ); - $argv = $this->consoleOpts->getRemainingArgs(); - if (isset($argv[0])) { - $harvest->setStartDate($argv[0]); - } - $harvest->launch(); - } catch (\Exception $e) { - Console::writeLine($e->getMessage()); - return $this->getFailureResponse(); - } - return $this->getSuccessResponse(); - } - /** * Harvest OAI-PMH records. * -- GitLab