diff --git a/composer.json b/composer.json index b4e6a4ee296925652e59459ee81857028467ee4f..eb5e8af8a0c85cacaf36a01c1e46f0644dd0b666 100644 --- a/composer.json +++ b/composer.json @@ -25,7 +25,7 @@ "symfony/yaml": "3.2.7", "swagger-api/swagger-ui": "2.2.10", "vufind-org/vufindcode": "1.0.3", - "vufind-org/vufindharvest": "2.2.0", + "vufind-org/vufindharvest": "2.3.0", "vufind-org/vufindhttp": "2.1.1", "yajra/laravel-pdo-via-oci8": "1.3.1", "zendframework/zend-cache": "2.7.2", diff --git a/harvest/oai.ini b/harvest/oai.ini index 54f494b1800d515d6b4cdf5058080ba248f83f20..7fc22feb9d6caef45b0ff5b3123db8588bea8b08 100644 --- a/harvest/oai.ini +++ b/harvest/oai.ini @@ -24,6 +24,7 @@ ; sslcafile = "/etc/pki/tls/cert.pem" ; e.g. for CentOS systems ; sslverifypeer = true ; sanitize = true +; sanitizeRegex[] = "/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u" ; badXMLLog = bad.log ; httpUser = myUsername ; httpPass = myPassword @@ -60,6 +61,11 @@ ; attributes to this tag, e.g., <collection attr="value"> will correctly ; wrap the records in <collection attr="value"></collection> tags. ; +; globalSearch[] and globalReplace[] may be used to manipulate the raw XML metadata +; documents with regular expressions. This should be used with caution but can be +; helpful when compensating for server-side encoding/markup errors. You may use +; multiple pairs of search and replace settings to perform multiple manipulations. +; ; idPrefix is the OAI-specific prefix attached to ID values. If you provide the ; value here, it will be automatically stripped for you when generating filenames, ; injecting IDs and tracking deleted records. If you omit the setting, full @@ -120,6 +126,10 @@ ; sanitize may be set to true to strip illegal characters from XML responses; it ; defaults to false, assuming that the OAI-PMH server you are harvesting from will ; provide you with valid data. +; +; sanitizeRegex may be set to an array of regex strings used to sanitize XML retrieved +; from an OAI-PMH source. Any text sequences matching these expressions will be +; replaced with blank spaces. ; ; badXMLLog may be set to a filename (which will be created within your harvest ; directory) to contain copies of bad XML that was fixed when the sanitize setting @@ -140,4 +150,8 @@ ;idSearch[] = "/\//" ;idReplace[] = "-" ;injectId = "identifier" -;injectDate = "datestamp" \ No newline at end of file +;injectDate = "datestamp" +; +; Further examples for harvesting OAI-PMH sources available to the general +; public may be found on this wiki page: +; https://vufind.org/wiki/indexing:open_data_sources