From ebef5a3d5f015c77cbbdc74846f6520c97c83373 Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Thu, 6 Apr 2017 14:39:40 -0400
Subject: [PATCH] Upgrade vufindharvest.

---
 composer.json   |  2 +-
 harvest/oai.ini | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/composer.json b/composer.json
index b4e6a4ee296..eb5e8af8a0c 100644
--- a/composer.json
+++ b/composer.json
@@ -25,7 +25,7 @@
         "symfony/yaml": "3.2.7",
         "swagger-api/swagger-ui": "2.2.10",
         "vufind-org/vufindcode": "1.0.3",
-        "vufind-org/vufindharvest": "2.2.0",
+        "vufind-org/vufindharvest": "2.3.0",
         "vufind-org/vufindhttp": "2.1.1",
         "yajra/laravel-pdo-via-oci8": "1.3.1",
         "zendframework/zend-cache": "2.7.2",
diff --git a/harvest/oai.ini b/harvest/oai.ini
index 54f494b1800..7fc22feb9d6 100644
--- a/harvest/oai.ini
+++ b/harvest/oai.ini
@@ -24,6 +24,7 @@
 ; sslcafile = "/etc/pki/tls/cert.pem" ; e.g. for CentOS systems
 ; sslverifypeer = true
 ; sanitize = true
+; sanitizeRegex[] = "/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u"
 ; badXMLLog = bad.log
 ; httpUser = myUsername
 ; httpPass = myPassword
@@ -60,6 +61,11 @@
 ; attributes to this tag, e.g., <collection attr="value"> will correctly
 ; wrap the records in <collection attr="value"></collection> tags.
 ;
+; globalSearch[] and globalReplace[] may be used to manipulate the raw XML metadata
+; documents with regular expressions. This should be used with caution but can be
+; helpful when compensating for server-side encoding/markup errors. You may use
+; multiple pairs of search and replace settings to perform multiple manipulations.
+;
 ; idPrefix is the OAI-specific prefix attached to ID values. If you provide the
 ; value here, it will be automatically stripped for you when generating filenames,
 ; injecting IDs and tracking deleted records. If you omit the setting, full
@@ -120,6 +126,10 @@
 ; sanitize may be set to true to strip illegal characters from XML responses; it
 ; defaults to false, assuming that the OAI-PMH server you are harvesting from will
 ; provide you with valid data.
+;
+; sanitizeRegex may be set to an array of regex strings used to sanitize XML retrieved
+; from an OAI-PMH source. Any text sequences matching these expressions will be
+; replaced with blank spaces.
 ;
 ; badXMLLog may be set to a filename (which will be created within your harvest
 ; directory) to contain copies of bad XML that was fixed when the sanitize setting
@@ -140,4 +150,8 @@
 ;idSearch[] = "/\//"
 ;idReplace[] = "-"
 ;injectId = "identifier"
-;injectDate = "datestamp"
\ No newline at end of file
+;injectDate = "datestamp"
+;
+; Further examples for harvesting OAI-PMH sources available to the general
+; public may be found on this wiki page:
+; https://vufind.org/wiki/indexing:open_data_sources
-- 
GitLab