Skip to content
Snippets Groups Projects
Commit 2d2637af authored by Demian Katz's avatar Demian Katz
Browse files

Optional "sanitize" harvest setting.

- Resolves VUFIND-731.
- Thanks to Filipe Bento.
parent b116aed1
No related merge requests found
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
; harvestedIdLog = harvest.log ; harvestedIdLog = harvest.log
; verbose = false ; verbose = false
; sslverifypeer = true ; sslverifypeer = true
; sanitize = true
; badXMLLog = bad.log
; ;
; The section_name may be passed to harvest_oai.php as a parameter to harvest only ; The section_name may be passed to harvest_oai.php as a parameter to harvest only
; records from that source. This is also the directory name that records will be ; records from that source. This is also the directory name that records will be
...@@ -78,6 +80,14 @@ ...@@ -78,6 +80,14 @@
; ;
; sslverifypeer may be set to false to disable SSL certificate checking; it defaults ; sslverifypeer may be set to false to disable SSL certificate checking; it defaults
; to true, and changing the setting is not recommended. ; to true, and changing the setting is not recommended.
;
; sanitize may be set to true to strip illegal characters from XML responses; it
; defaults to false, assuming that the OAI-PMH server you are harvesting from will
; provide you with valid data.
;
; badXMLLog may be set to a filename (which will be created within your harvest
; directory) to contain copies of the original, unsanitized XML of any response
; found to contain illegal characters while the sanitize setting (above) is set
; to true.
; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS ; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS
;[OJS] ;[OJS]
......
...@@ -174,6 +174,20 @@ class OAI ...@@ -174,6 +174,20 @@ class OAI
*/ */
protected $verbose = false; protected $verbose = false;
/**
 * Should we sanitize XML? When true, each response is passed through
 * sanitizeXML() before it is parsed; corresponds to the "sanitize" harvest
 * setting and defaults to false.
 *
 * @var bool
 */
protected $sanitize = false;
/**
 * Filename for logging bad XML responses (false for none). The name is
 * appended to the harvest base path, and the original (unsanitized) XML is
 * written there whenever sanitizing had to replace something.
 *
 * @var string|bool
 */
protected $badXMLLog = false;
/** /**
* As we harvest records, we want to track the most recent date encountered * As we harvest records, we want to track the most recent date encountered
* so we can set a start point for the next harvest. (Unix timestamp format) * so we can set a start point for the next harvest. (Unix timestamp format)
...@@ -257,6 +271,12 @@ class OAI ...@@ -257,6 +271,12 @@ class OAI
if (isset($settings['verbose'])) { if (isset($settings['verbose'])) {
$this->verbose = $settings['verbose']; $this->verbose = $settings['verbose'];
} }
if (isset($settings['sanitize'])) {
$this->sanitize = $settings['sanitize'];
}
if (isset($settings['badXMLLog'])) {
$this->badXMLLog = $settings['badXMLLog'];
}
if ($this->granularity == 'auto') { if ($this->granularity == 'auto') {
$this->loadGranularity(); $this->loadGranularity();
} }
...@@ -426,6 +446,43 @@ class OAI ...@@ -426,6 +446,43 @@ class OAI
return $this->processResponse($result->getBody()); return $this->processResponse($result->getBody());
} }
/**
 * Log a bad XML response.
 *
 * Appends the XML, followed by an empty line, to the file named by
 * $this->badXMLLog inside $this->basePath.
 *
 * @param string $xml Bad XML
 *
 * @return void
 * @throws \Exception If the log file cannot be opened or written to.
 */
protected function logBadXML($xml)
{
    $file = fopen($this->basePath . $this->badXMLLog, 'a');
    if ($file === false) {
        throw new \Exception("Problem opening {$this->badXMLLog}.");
    }

    // Capture the write result but close the handle before reacting to it,
    // so the handle is released on the failure path as well.
    $written = fwrite($file, $xml . "\n\n");
    fclose($file);

    if ($written === false) {
        throw new \Exception("Problem writing to {$this->badXMLLog}.");
    }
}
/**
 * Sanitize XML.
 *
 * Replaces every run of characters that falls outside the XML 1.0 "Char"
 * production with a single space and trims the result. When something had to
 * be replaced and a bad XML log is configured, the original (unsanitized)
 * input is logged.
 *
 * @param string $xml XML to sanitize
 *
 * @return string
 */
protected function sanitizeXML($xml)
{
    // Legal XML 1.0 characters: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and
    // the supplementary planes U+10000-U+10FFFF. Everything else is replaced.
    $regex = '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}'
        . '\x{10000}-\x{10FFFF}]+/u';
    $count = 0;
    $newXML = preg_replace($regex, ' ', $xml, -1, $count);

    // With the /u modifier preg_replace() returns null when the subject is
    // not valid UTF-8. Do not turn that into an empty document: keep a copy
    // of the offending input (if logging is enabled) and hand the original
    // text on, so the XML parser reports the actual problem.
    if ($newXML === null) {
        if ($this->badXMLLog) {
            $this->logBadXML($xml);
        }
        return trim($xml);
    }

    if ($count > 0 && $this->badXMLLog) {
        $this->logBadXML($xml);
    }
    return trim($newXML);
}
/** /**
* Process an OAI-PMH response into a SimpleXML object. Die if an error is * Process an OAI-PMH response into a SimpleXML object. Die if an error is
* detected. * detected.
...@@ -436,6 +493,11 @@ class OAI ...@@ -436,6 +493,11 @@ class OAI
*/ */
protected function processResponse($xml) protected function processResponse($xml)
{ {
// Sanitize if necessary:
if ($this->sanitize) {
$xml = $this->sanitizeXML($xml);
}
// Parse the XML: // Parse the XML:
$result = simplexml_load_string($xml); $result = simplexml_load_string($xml);
if (!$result) { if (!$result) {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment