From 2d2637afa08485fb9c0cc2d8a7c0e3254a5e58a3 Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Fri, 4 Oct 2013 14:20:46 -0400 Subject: [PATCH] Optional "sanitize" harvest setting. - Resolves VUFIND-731. - Thanks to Filipe Bento. --- harvest/oai.ini | 10 ++++ module/VuFind/src/VuFind/Harvester/OAI.php | 62 ++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/harvest/oai.ini b/harvest/oai.ini index da519756eb8..f40c32c92fc 100644 --- a/harvest/oai.ini +++ b/harvest/oai.ini @@ -17,6 +17,8 @@ ; harvestedIdLog = harvest.log ; verbose = false ; sslverifypeer = true +; sanitize = true +; badXMLLog = bad.log ; ; The section_name may be passed to harvest_oai.php as a parameter to harvest only ; records from that source. This is also the directory name that records will be @@ -78,6 +80,14 @@ ; ; sslverifypeer may be set to false to disable SSL certificate checking; it defaults ; to true, and changing the setting is not recommended. +; +; sanitize may be set to true to strip illegal characters from XML responses; it +; defaults to false, assuming that the OAI-PMH server you are harvesting from will +; provide you with valid data. +; +; badXMLLog may be set to a filename (which will be created within your harvest +; directory) to contain copies of bad XML that was fixed when the sanitize setting +; (above) is set to true. ; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS ;[OJS] diff --git a/module/VuFind/src/VuFind/Harvester/OAI.php b/module/VuFind/src/VuFind/Harvester/OAI.php index 0fa942941ea..0130804097b 100644 --- a/module/VuFind/src/VuFind/Harvester/OAI.php +++ b/module/VuFind/src/VuFind/Harvester/OAI.php @@ -174,6 +174,20 @@ class OAI */ protected $verbose = false; + /** + * Should we sanitize XML? + * + * @var bool + */ + protected $sanitize = false; + + /** + * Filename for logging bad XML responses (false for none) + * + * @var string|bool + */ + protected $badXMLLog = false; + /** * As we harvest records, we want to track the most recent date encountered * so we can set a start point for the next harvest. (Unix timestamp format) @@ -257,6 +271,12 @@ class OAI if (isset($settings['verbose'])) { $this->verbose = $settings['verbose']; } + if (isset($settings['sanitize'])) { + $this->sanitize = $settings['sanitize']; + } + if (isset($settings['badXMLLog'])) { + $this->badXMLLog = $settings['badXMLLog']; + } if ($this->granularity == 'auto') { $this->loadGranularity(); } @@ -426,6 +446,43 @@ class OAI return $this->processResponse($result->getBody()); } + /** + * Log a bad XML response. + * + * @param string $xml Bad XML + * + * @return void + */ + protected function logBadXML($xml) + { + $file = fopen($this->basePath . $this->badXMLLog, 'a'); + if (!$file) { + throw new \Exception("Problem opening {$this->badXMLLog}."); + } + fputs($file, $xml . "\n\n"); + fclose($file); + } + + /** + * Sanitize XML. + * + * @param string $xml XML to sanitize + * + * @return string + */ + protected function sanitizeXML($xml) + { + // Sanitize the XML if requested: + $regex = '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u'; + $newXML = trim(preg_replace($regex, ' ', $xml, -1, $count)); + + if ($count > 0 && $this->badXMLLog) { + $this->logBadXML($xml); + } + + return $newXML; + } + /** * Process an OAI-PMH response into a SimpleXML object. Die if an error is * detected. @@ -436,6 +493,11 @@ class OAI */ protected function processResponse($xml) { + // Sanitize if necessary: + if ($this->sanitize) { + $xml = $this->sanitizeXML($xml); + } + // Parse the XML: $result = simplexml_load_string($xml); if (!$result) { -- GitLab