diff --git a/import/index_java/src/org/solrmarc/index/VuFindIndexer.java b/import/index_java/src/org/solrmarc/index/VuFindIndexer.java deleted file mode 100644 index af72537dc64277873d1c950b5219d094eb2e6209..0000000000000000000000000000000000000000 --- a/import/index_java/src/org/solrmarc/index/VuFindIndexer.java +++ /dev/null @@ -1,2764 +0,0 @@ -package org.solrmarc.index; -/** - * Custom VuFind indexing routines. - * - * Copyright (C) Villanova University 2017. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -import java.io.File; -import java.io.FileReader; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.lang.StringBuilder; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.sql.*; -import java.text.SimpleDateFormat; - -import org.apache.log4j.Logger; -import org.marc4j.marc.ControlField; -import org.marc4j.marc.DataField; -import org.marc4j.marc.Record; -import org.marc4j.marc.Subfield; -import org.marc4j.marc.VariableField; -import org.solrmarc.callnum.DeweyCallNumber; -import org.solrmarc.callnum.LCCallNumber; -import org.solrmarc.tools.CallNumUtils; -import org.solrmarc.tools.SolrMarcIndexerException; -import org.solrmarc.tools.Utils; -import org.ini4j.Ini; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; - -import org.w3c.dom.Document; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - -/** - * - * @author Robert Haschart - * @version $Id: VuFindIndexer.java 224 2008-11-05 19:33:21Z asnagy $ - * - */ -public class VuFindIndexer extends SolrIndexer -{ - // Initialize logging category - static Logger logger = Logger.getLogger(VuFindIndexer.class.getName()); - - // Initialize VuFind database connection (null until explicitly activated) - private Connection vufindDatabase = null; - private UpdateDateTracker tracker = null; - - // The SimpleDateFormat class is not thread-safe, so the fields below were changed to be - // non-static; given the rest of the design of SolrMarc, this makes them work correctly.
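- // For reference, a full 005 value looks like "20170301120000.0" (yyyyMMddHHmmss.S) and - // the first six characters of the 008 hold a yyMMdd "date entered" value such as "170301".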
- private SimpleDateFormat marc005date = new SimpleDateFormat("yyyyMMddHHmmss.S"); - private SimpleDateFormat marc008date = new SimpleDateFormat("yyMMdd"); - - private static final Pattern COORDINATES_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d{3})(\\d{2})(\\d{2})"); - private static final Pattern HDMSHDD_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d+(\\.\\d+)?)"); - private static final Pattern PMDD_PATTERN = Pattern.compile("^([+-])(\\d+(\\.\\d+)?)"); - - private static ConcurrentHashMap<String, Ini> configCache = new ConcurrentHashMap<String, Ini>(); - private ConcurrentHashMap<String, String> relatorSynonymLookup = new ConcurrentHashMap<String, String>(); - private Set<String> knownRelators = new LinkedHashSet<String>(); - - // Shutdown flag: - private boolean shuttingDown = false; - - // VuFind-specific configs: - private Properties vuFindConfigs = null; - - /** - * Default constructor - * @param propertiesMapFile the {@code x_index.properties} file mapping solr - * field names to values in the marc records - * @param propertyDirs array of directories holding properties files - * @throws Exception if {@code SolrIndexer} constructor threw an exception. - */ - public VuFindIndexer(final String propertiesMapFile, final String[] propertyDirs) - throws FileNotFoundException, IOException, ParseException { - super(propertiesMapFile, propertyDirs); - try { - vuFindConfigs = Utils.loadProperties(propertyDirs, "vufind.properties"); - } catch (IllegalArgumentException e) { - // If the properties load failed, don't worry about it -- we'll use defaults. - } - } - - /** - * Log an error message and throw a fatal exception. - * @param msg message to log - */ - private void dieWithError(String msg) - { - logger.error(msg); - throw new SolrMarcIndexerException(SolrMarcIndexerException.EXIT, msg); - } - - /** - * Given the base name of a configuration file, locate the full path. - * @param filename base name of a configuration file - */ - private File findConfigFile(String filename) - { - // Find VuFind's home directory in the environment; if it's not available, - // try using a relative path on the assumption that we are currently in - // VuFind's import subdirectory: - String vufindHome = System.getenv("VUFIND_HOME"); - if (vufindHome == null) { - vufindHome = ".."; - } - - // Check for VuFind 2.0's local directory environment variable: - String vufindLocal = System.getenv("VUFIND_LOCAL_DIR"); - - // Get the relative VuFind path from the properties file, defaulting to - // the 2.0-style config/vufind if necessary. - String relativeConfigPath = Utils.getProperty( - vuFindConfigs, "vufind.config.relative_path", "config/vufind" - ); - - // Try several different locations for the file -- VuFind 2 local dir, - // VuFind 2 base dir, VuFind 1 base dir. - File file; - if (vufindLocal != null) { - file = new File(vufindLocal + "/" + relativeConfigPath + "/" + filename); - if (file.exists()) { - return file; - } - } - file = new File(vufindHome + "/" + relativeConfigPath + "/" + filename); - if (file.exists()) { - return file; - } - file = new File(vufindHome + "/web/conf/" + filename); - return file; - } - - /** - * Sanitize a VuFind configuration setting. 
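- * For example, the raw value {@code "true" ; trailing comment} is reduced to {@code true}.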
- * @param str configuration setting - */ - private String sanitizeConfigSetting(String str) - { - // Drop comments if necessary: - int pos = str.indexOf(';'); - if (pos >= 0) { - str = str.substring(0, pos).trim(); - } - - // Strip wrapping quotes if necessary (the ini reader won't do this for us): - if (str.startsWith("\"")) { - str = str.substring(1, str.length()); - } - if (str.endsWith("\"")) { - str = str.substring(0, str.length() - 1); - } - return str; - } - - /** - * Load an ini file. - * @param filename name of {@code .ini} file - */ - public Ini loadConfigFile(String filename) - { - // Retrieve the file if it is not already cached. - if (!configCache.containsKey(filename)) { - Ini ini = new Ini(); - try { - ini.load(new FileReader(findConfigFile(filename))); - configCache.putIfAbsent(filename, ini); - } catch (Throwable e) { - dieWithError("Unable to access " + filename); - } - } - return configCache.get(filename); - } - - /** - * Get a section from a VuFind configuration file. - * @param filename configuration file name - * @param section section name within the file - */ - public Map<String, String> getConfigSection(String filename, String section) - { - // Grab the ini file. - Ini ini = loadConfigFile(filename); - Map<String, String> retVal = ini.get(section); - - String parent = ini.get("Parent_Config", "path"); - while (parent != null) { - Ini parentIni = loadConfigFile(parent); - Map<String, String> parentSection = parentIni.get(section); - for (String key : parentSection.keySet()) { - if (!retVal.containsKey(key)) { - retVal.put(key, parentSection.get(key)); - } - } - parent = parentIni.get("Parent_Config", "path"); - } - - // Check to see if we need to worry about an override file: - String override = ini.get("Extra_Config", "local_overrides"); - if (override != null) { - Map<String, String> overrideSection = loadConfigFile(override).get(section); - for (String key : overrideSection.keySet()) { - retVal.put(key, overrideSection.get(key)); - } - } - return retVal; - } - - /** - * Get a setting from a VuFind configuration file. - * @param filename configuration file name - * @param section section name within the file - * @param setting setting name within the section - */ - public String getConfigSetting(String filename, String section, String setting) - { - String retVal = null; - - // Grab the ini file. - Ini ini = loadConfigFile(filename); - - // Check to see if we need to worry about an override file: - String override = ini.get("Extra_Config", "local_overrides"); - if (override != null) { - Ini overrideIni = loadConfigFile(override); - retVal = overrideIni.get(section, setting); - if (retVal != null) { - return sanitizeConfigSetting(retVal); - } - } - - // Try to find the requested setting: - retVal = ini.get(section, setting); - - // No setting? Check for a parent configuration: - while (retVal == null) { - String parent = ini.get("Parent_Config", "path"); - if (parent != null) { - try { - ini.load(new FileReader(new File(parent))); - } catch (Throwable e) { - dieWithError("Unable to access " + parent); - } - retVal = ini.get(section, setting); - } else { - break; - } - } - - // Return the processed setting: - return retVal == null ? null : sanitizeConfigSetting(retVal); - } - - /** - * Connect to the VuFind database if we do not already have a connection. - */ - private void connectToDatabase() - { - // Already connected? Do nothing further! 
- if (vufindDatabase != null) { - return; - } - - String dsn = getConfigSetting("config.ini", "Database", "database"); - - try { - // Parse key settings from the PHP-style DSN: - String username = ""; - String password = ""; - String classname = "invalid"; - String prefix = "invalid"; - if (dsn.substring(0, 8).equals("mysql://")) { - classname = "com.mysql.jdbc.Driver"; - prefix = "mysql"; - } else if (dsn.substring(0, 8).equals("pgsql://")) { - classname = "org.postgresql.Driver"; - prefix = "postgresql"; - } - - Class.forName(classname).newInstance(); - String[] parts = dsn.split("://"); - if (parts.length > 1) { - parts = parts[1].split("@"); - if (parts.length > 1) { - dsn = prefix + "://" + parts[1]; - parts = parts[0].split(":"); - username = parts[0]; - if (parts.length > 1) { - password = parts[1]; - } - } - } - - // Connect to the database: - vufindDatabase = DriverManager.getConnection("jdbc:" + dsn, username, password); - } catch (Throwable e) { - dieWithError("Unable to connect to VuFind database"); - } - - Runtime.getRuntime().addShutdownHook(new VuFindShutdownThread(this)); - } - - private void disconnectFromDatabase() - { - if (vufindDatabase != null) { - try { - vufindDatabase.close(); - } catch (SQLException e) { - System.err.println("Unable to disconnect from VuFind database"); - logger.error("Unable to disconnect from VuFind database"); - } - } - } - - public void shutdown() - { - disconnectFromDatabase(); - shuttingDown = true; - } - - class VuFindShutdownThread extends Thread - { - private VuFindIndexer indexer; - - public VuFindShutdownThread(VuFindIndexer i) - { - indexer = i; - } - - public void run() - { - indexer.shutdown(); - } - } - - /** - * Establish UpdateDateTracker object if not already available. - */ - private void loadUpdateDateTracker() throws java.sql.SQLException - { - if (tracker == null) { - connectToDatabase(); - tracker = new UpdateDateTracker(vufindDatabase); - } - } - - /** - * Support method for getLatestTransaction. - * @return Date extracted from 005 (or very old date, if unavailable) - */ - private java.util.Date normalize005Date(String input) - { - // Normalize "null" strings to a generic bad value: - if (input == null) { - input = "null"; - } - - // Try to parse the date; default to "millisecond 0" (very old date) if we can't - // parse the data successfully. - java.util.Date retVal; - try { - retVal = marc005date.parse(input); - } catch(java.text.ParseException e) { - retVal = new java.util.Date(0); - } - return retVal; - } - - /** - * Support method for getLatestTransaction. - * @return Date extracted from 008 (or very old date, if unavailable) - */ - private java.util.Date normalize008Date(String input) - { - // Normalize "null" strings to a generic bad value: - if (input == null || input.length() < 6) { - input = "null"; - } - - // Try to parse the date; default to "millisecond 0" (very old date) if we can't - // parse the data successfully. - java.util.Date retVal; - try { - retVal = marc008date.parse(input.substring(0, 6)); - } catch(java.lang.StringIndexOutOfBoundsException e) { - retVal = new java.util.Date(0); - } catch(java.text.ParseException e) { - retVal = new java.util.Date(0); - } - return retVal; - } - - /** - * Extract the latest transaction date from the MARC record. This is useful - * for detecting when a record has changed since the last time it was indexed. - * - * @param record MARC record - * @return Latest transaction date. 
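- * (drawn from the 005 when available, falling back to the 008 date, then to epoch zero)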
- */ - public java.util.Date getLatestTransaction(Record record) { - // First try the 005 -- this is most likely to have a precise transaction date: - Set<String> dates = getFieldList(record, "005"); - if (dates != null) { - Iterator<String> dateIter = dates.iterator(); - if (dateIter.hasNext()) { - return normalize005Date(dateIter.next()); - } - } - - // No luck with 005? Try 008 next -- less precise, but better than nothing: - dates = getFieldList(record, "008"); - if (dates != null) { - Iterator<String> dateIter = dates.iterator(); - if (dateIter.hasNext()) { - return normalize008Date(dateIter.next()); - } - } - - // If we got this far, we couldn't find a valid value; return an arbitrary date: - return new java.util.Date(0); - } - - /** - * Get all available publishers from the record. - * - * @param record MARC record - * @return set of publishers - */ - public Set<String> getPublishers(final Record record) { - Set<String> publishers = new LinkedHashSet<String>(); - - // First check old-style 260b name: - List<VariableField> list260 = record.getVariableFields("260"); - for (VariableField vf : list260) - { - DataField df = (DataField) vf; - String currentString = ""; - for (Subfield current : df.getSubfields('b')) { - currentString = currentString.trim().concat(" " + current.getData()).trim(); - } - if (currentString.length() > 0) { - publishers.add(currentString); - } - } - - // Now track down relevant RDA-style 264b names; we only care about - // copyright and publication names (and ignore copyright names if - // publication names are present). - Set<String> pubNames = new LinkedHashSet<String>(); - Set<String> copyNames = new LinkedHashSet<String>(); - List<VariableField> list264 = record.getVariableFields("264"); - for (VariableField vf : list264) - { - DataField df = (DataField) vf; - String currentString = ""; - for (Subfield current : df.getSubfields('b')) { - currentString = currentString.trim().concat(" " + current.getData()).trim(); - } - if (currentString.length() > 0) { - char ind2 = df.getIndicator2(); - switch (ind2) - { - case '1': - pubNames.add(currentString); - break; - case '4': - copyNames.add(currentString); - break; - } - } - } - if (pubNames.size() > 0) { - publishers.addAll(pubNames); - } else if (copyNames.size() > 0) { - publishers.addAll(copyNames); - } - - return publishers; - } - - /** - * Get all available dates from the record. - * - * @param record MARC record - * @return set of dates - */ - public Set<String> getDates(final Record record) { - Set<String> dates = new LinkedHashSet<String>(); - - // First check old-style 260c date: - List<VariableField> list260 = record.getVariableFields("260"); - for (VariableField vf : list260) { - DataField df = (DataField) vf; - List<Subfield> currentDates = df.getSubfields('c'); - for (Subfield sf : currentDates) { - String currentDateStr = Utils.cleanDate(sf.getData()); - if (currentDateStr != null) dates.add(currentDateStr); - } - } - - // Now track down relevant RDA-style 264c dates; we only care about - // copyright and publication dates (and ignore copyright dates if - // publication dates are present). 
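- // For example, given 264 _1 $c 2015 and 264 _4 $c 2014, only "2015" is indexed; the - // copyright date would be used only if no publication date were present.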
- Set<String> pubDates = new LinkedHashSet<String>(); - Set<String> copyDates = new LinkedHashSet<String>(); - List<VariableField> list264 = record.getVariableFields("264"); - for (VariableField vf : list264) { - DataField df = (DataField) vf; - List<Subfield> currentDates = df.getSubfields('c'); - for (Subfield sf : currentDates) { - String currentDateStr = Utils.cleanDate(sf.getData()); - char ind2 = df.getIndicator2(); - switch (ind2) - { - case '1': - if (currentDateStr != null) pubDates.add(currentDateStr); - break; - case '4': - if (currentDateStr != null) copyDates.add(currentDateStr); - break; - } - } - } - if (pubDates.size() > 0) { - dates.addAll(pubDates); - } else if (copyDates.size() > 0) { - dates.addAll(copyDates); - } - - return dates; - } - - /** - * Get the earliest publication date from the record. - * - * @param record MARC record - * @return earliest date - */ - public String getFirstDate(final Record record) { - String result = null; - Set<String> dates = getDates(record); - for(String current: dates) { - if (result == null || Integer.parseInt(current) < Integer.parseInt(result)) { - result = current; - } - } - return result; - } - - /** - * Determine Record Format(s) - * - * @param record MARC record - * @return set of record formats - */ - public Set<String> getFormat(final Record record){ - Set<String> result = new LinkedHashSet<String>(); - String leader = record.getLeader().toString(); - char leaderBit; - ControlField fixedField = (ControlField) record.getVariableField("008"); - DataField title = (DataField) record.getVariableField("245"); - String formatString; - char formatCode = ' '; - char formatCode2 = ' '; - char formatCode4 = ' '; - - // check if there's an h in the 245 - if (title != null) { - if (title.getSubfield('h') != null){ - if (title.getSubfield('h').getData().toLowerCase().contains("[electronic resource]")) { - result.add("Electronic"); - return result; - } - } - } - - // check the 007 - this is a repeating field - List<VariableField> fields = record.getVariableFields("007"); - Iterator<VariableField> fieldsIter = fields.iterator(); - if (fields != null) { - // TODO: update loop to for(:) syntax, but problem with type casting. - ControlField formatField; - while(fieldsIter.hasNext()) { - formatField = (ControlField) fieldsIter.next(); - formatString = formatField.getData().toUpperCase(); - formatCode = formatString.length() > 0 ? formatString.charAt(0) : ' '; - formatCode2 = formatString.length() > 1 ? formatString.charAt(1) : ' '; - formatCode4 = formatString.length() > 4 ? 
formatString.charAt(4) : ' '; - switch (formatCode) { - case 'A': - switch(formatCode2) { - case 'D': - result.add("Atlas"); - break; - default: - result.add("Map"); - break; - } - break; - case 'C': - switch(formatCode2) { - case 'A': - result.add("TapeCartridge"); - break; - case 'B': - result.add("ChipCartridge"); - break; - case 'C': - result.add("DiscCartridge"); - break; - case 'F': - result.add("TapeCassette"); - break; - case 'H': - result.add("TapeReel"); - break; - case 'J': - result.add("FloppyDisk"); - break; - case 'M': - case 'O': - result.add("CDROM"); - break; - case 'R': - // Do not return - this will cause anything with an - // 856 field to be labeled as "Electronic" - break; - default: - result.add("Software"); - break; - } - break; - case 'D': - result.add("Globe"); - break; - case 'F': - result.add("Braille"); - break; - case 'G': - switch(formatCode2) { - case 'C': - case 'D': - result.add("Filmstrip"); - break; - case 'T': - result.add("Transparency"); - break; - default: - result.add("Slide"); - break; - } - break; - case 'H': - result.add("Microfilm"); - break; - case 'K': - switch(formatCode2) { - case 'C': - result.add("Collage"); - break; - case 'D': - result.add("Drawing"); - break; - case 'E': - result.add("Painting"); - break; - case 'F': - result.add("Print"); - break; - case 'G': - result.add("Photonegative"); - break; - case 'J': - result.add("Print"); - break; - case 'L': - result.add("Drawing"); - break; - case 'O': - result.add("FlashCard"); - break; - case 'N': - result.add("Chart"); - break; - default: - result.add("Photo"); - break; - } - break; - case 'M': - switch(formatCode2) { - case 'F': - result.add("VideoCassette"); - break; - case 'R': - result.add("Filmstrip"); - break; - default: - result.add("MotionPicture"); - break; - } - break; - case 'O': - result.add("Kit"); - break; - case 'Q': - result.add("MusicalScore"); - break; - case 'R': - result.add("SensorImage"); - break; - case 'S': - switch(formatCode2) { - case 'D': - result.add("SoundDisc"); - break; - case 'S': - result.add("SoundCassette"); - break; - default: - result.add("SoundRecording"); - break; - } - break; - case 'V': - switch(formatCode2) { - case 'C': - result.add("VideoCartridge"); - break; - case 'D': - switch(formatCode4) { - case 'S': - result.add("BRDisc"); - break; - case 'V': - default: - result.add("VideoDisc"); - break; - } - break; - case 'F': - result.add("VideoCassette"); - break; - case 'R': - result.add("VideoReel"); - break; - default: - result.add("Video"); - break; - } - break; - } - } - if (!result.isEmpty()) { - return result; - } - } - - // check the Leader at position 6 - leaderBit = leader.charAt(6); - switch (Character.toUpperCase(leaderBit)) { - case 'C': - case 'D': - result.add("MusicalScore"); - break; - case 'E': - case 'F': - result.add("Map"); - break; - case 'G': - result.add("Slide"); - break; - case 'I': - result.add("SoundRecording"); - break; - case 'J': - result.add("MusicRecording"); - break; - case 'K': - result.add("Photo"); - break; - case 'M': - result.add("Electronic"); - break; - case 'O': - case 'P': - result.add("Kit"); - break; - case 'R': - result.add("PhysicalObject"); - break; - case 'T': - result.add("Manuscript"); - break; - } - if (!result.isEmpty()) { - return result; - } - - // check the Leader at position 7 - leaderBit = leader.charAt(7); - switch (Character.toUpperCase(leaderBit)) { - // Monograph - case 'M': - if (formatCode == 'C') { - result.add("eBook"); - } else { - result.add("Book"); - } - break; - // Component 
parts - case 'A': - result.add("BookComponentPart"); - break; - case 'B': - result.add("SerialComponentPart"); - break; - // Serial - case 'S': - // Look in 008 to determine what type of Continuing Resource - formatCode = fixedField.getData().toUpperCase().charAt(21); - switch (formatCode) { - case 'N': - result.add("Newspaper"); - break; - case 'P': - result.add("Journal"); - break; - default: - result.add("Serial"); - break; - } - } - - // Nothing worked! - if (result.isEmpty()) { - result.add("Unknown"); - } - - return result; - } - - /** - * Get call numbers of a specific type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. - * - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @param result a collection to gather the call numbers - * @return collection of call numbers, same object as {@code result} - */ - public static Collection<String> getCallNumberByTypeCollector( - Record record, String fieldSpec, String callTypeSf, String callType, Collection<String> result) { - for (String tag : fieldSpec.split(":")) { - // Check to ensure tag length is at least 3 characters - if (tag.length() < 3) { - //TODO: Should this go to a log? Better message for a bad tag in a field spec? - System.err.println("Invalid tag specified: " + tag); - continue; - } - String dfTag = tag.substring(0, 3); - String sfSpec = null; - if (tag.length() > 3) { - sfSpec = tag.substring(3); - } - - // do all fields for this tag - for (VariableField vf : record.getVariableFields(dfTag)) { - // Assume tag represents a DataField - DataField df = (DataField) vf; - boolean callTypeMatch = false; - - // Assume call type subfield could repeat - for (Subfield typeSf : df.getSubfields(callTypeSf)) { - if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { - callTypeMatch = true; - } - } - // take all fields with a matching call type - if (callTypeMatch) { - result.add(df.getSubfieldsAsString(sfSpec)); - } - } // end loop over variable fields - } // end loop over fieldSpec - return result; - } - - - /** - * Get call numbers of a specific type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @return set of call numbers - */ - public static Set<String> getCallNumberByType(Record record, String fieldSpec, String callTypeSf, String callType) { - return (Set<String>) getCallNumberByTypeCollector(record, fieldSpec, callTypeSf, callType, - new LinkedHashSet<String>()); - } - - /** - * Get call numbers of a specific type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges.
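- * <p>For example (assuming a local convention where $5 holds the scheme code), - * {@code getCallNumberByTypeAsList(record, "092ab", "5", "dewey")} would return the $a$b - * values of each 092 field whose $5 equals "dewey".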
- * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @return list of call numbers - */ - public static List<String> getCallNumberByTypeAsList(Record record, String fieldSpec, String callTypeSf, String callType) { - return (List<String>) getCallNumberByTypeCollector(record, fieldSpec, callTypeSf, callType, - new ArrayList<String>()); - } - - /** - * Extract the full call number from a record, stripped of spaces - * @param record MARC record - * @return Call number label - * @deprecated Obsolete as of VuFind 2.4. - * This method exists only to support the VuFind call number search, version <= 2.3. - * As of VuFind 2.4, the munging for call number search is handled entirely in Solr. - */ - @Deprecated - public String getFullCallNumber(final Record record) { - - return(getFullCallNumber(record, "099ab:090ab:050ab")); - } - - /** - * Extract the full call number from a record, stripped of spaces - * @param record MARC record - * @param fieldSpec taglist for call number fields - * @return Call number label - * @deprecated Obsolete as of VuFind 2.4. - * This method exists only to support the VuFind call number search, version <= 2.3. - * As of VuFind 2.4, the munging for call number search is handled entirely in Solr. - */ - @Deprecated - public String getFullCallNumber(final Record record, String fieldSpec) { - - String val = getFirstFieldVal(record, fieldSpec); - - if (val != null) { - return val.toUpperCase().replaceAll(" ", ""); - } else { - return val; - } - } - - /** - * Extract the call number label from a record - * @param record MARC record - * @return Call number label - */ - public String getCallNumberLabel(final Record record) { - - return getCallNumberLabel(record, "090a:050a"); - } - - /** - * Extract the call number label from a record - * @param record MARC record - * @param fieldSpec taglist for call number fields - * @return Call number label - */ - public String getCallNumberLabel(final Record record, String fieldSpec) { - - String val = getFirstFieldVal(record, fieldSpec); - - if (val != null) { - int dotPos = val.indexOf("."); - if (dotPos > 0) { - val = val.substring(0, dotPos); - } - return val.toUpperCase(); - } else { - return val; - } - } - - /** - * Extract the subject component of the call number - * - * Can return null - * - * @param record MARC record - * @return Call number subject letters - */ - public String getCallNumberSubject(final Record record) { - - return(getCallNumberSubject(record, "090a:050a")); - } - - /** - * Extract the subject component of the call number - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec taglist for call number fields - * @return Call number subject letters - */ - public String getCallNumberSubject(final Record record, String fieldSpec) { - - String val = getFirstFieldVal(record, fieldSpec); - - if (val != null) { - String [] callNumberSubject = val.toUpperCase().split("[^A-Z]+"); - if (callNumberSubject.length > 0) - { - return callNumberSubject[0]; - } - } - return(null); - } - - /** - * Normalize a single LC call number - * @param record current MARC record - * @return String Normalized LC call number - */ - public String getFullCallNumberNormalized(final Record record) { - - return(getFullCallNumberNormalized(record, "099ab:090ab:050ab")); - } - - /** - * Normalize a single LC call number - * @param record current MARC record - * @param fieldSpec which MARC fields /
subfields need to be analyzed - * @return String Normalized LC call number - */ - public String getFullCallNumberNormalized(final Record record, String fieldSpec) { - - // TODO: is the null fieldSpec still an issue? - if (fieldSpec != null) { - String cn = getFirstFieldVal(record, fieldSpec); - return (new LCCallNumber(cn)).getShelfKey(); - } - // If we got this far, we couldn't find a valid value: - return null; - } - - /** - * Determine if a record is illustrated. - * - * @param record MARC record - * @return "Illustrated" or "Not Illustrated" - */ - public String isIllustrated(Record record) { - String leader = record.getLeader().toString(); - - // Does the leader indicate this is a "language material" that might have extra - // illustration details in the fixed fields? - if (leader.charAt(6) == 'a') { - String currentCode = ""; // for use in loops below - - // List of 008/18-21 codes that indicate illustrations: - String illusCodes = "abcdefghijklmop"; - - // Check the illustration characters of the 008: - ControlField fixedField = (ControlField) record.getVariableField("008"); - if (fixedField != null) { - String fixedFieldText = fixedField.getData().toLowerCase(); - for (int i = 18; i <= 21; i++) { - if (i < fixedFieldText.length()) { - currentCode = fixedFieldText.substring(i, i + 1); - if (illusCodes.contains(currentCode)) { - return "Illustrated"; - } - } - } - } - - // Now check if any 006 fields apply: - List<VariableField> fields = record.getVariableFields("006"); - Iterator<VariableField> fieldsIter = fields.iterator(); - if (fields != null) { - while(fieldsIter.hasNext()) { - fixedField = (ControlField) fieldsIter.next(); - String fixedFieldText = fixedField.getData().toLowerCase(); - for (int i = 1; i <= 4; i++) { - if (i < fixedFieldText.length()) { - currentCode = fixedFieldText.substring(i, i + 1); - if (illusCodes.contains(currentCode)) { - return "Illustrated"; - } - } - } - } - } - } - - // Now check for interesting strings in 300 subfield b: - List<VariableField> fields = record.getVariableFields("300"); - Iterator<VariableField> fieldsIter = fields.iterator(); - if (fields != null) { - DataField physical; - while(fieldsIter.hasNext()) { - physical = (DataField) fieldsIter.next(); - List<Subfield> subfields = physical.getSubfields('b'); - for (Subfield sf: subfields) { - String desc = sf.getData().toLowerCase(); - if (desc.contains("ill.") || desc.contains("illus.")) { - return "Illustrated"; - } - } - } - } - - // If we made it this far, we found no sign of illustrations: - return "Not Illustrated"; - } - - - /** - * Normalize LC numbers for sorting purposes (use only the first valid number!). - * Will return first call number found if none pass validation, - * or empty string if no call numbers. - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return sortable shelf key of the first valid LC number encountered, - * otherwise shelf key of the first call number found.
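- * For example, shelf keys order {@code QA76.9} before {@code QA300}, an ordering that a - * plain string sort would reverse.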
- */ - public String getLCSortable(Record record, String fieldSpec) { - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - String firstCall = ""; - for (String current : input) { - // If this is a valid LC number, return the sortable shelf key: - LCCallNumber callNum = new LCCallNumber(current); - if (callNum.isValid()) { - return callNum.getShelfKey(); // RETURN first valid - } - if (firstCall.length() == 0) { - firstCall = current; - } - } - - // If we made it this far, did not find a valid LC number, so use what we have: - return new LCCallNumber(firstCall).getShelfKey(); - } - - /** - * Get sort key for first LC call number, identified by call type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. - * - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @return sort key for first identified LC call number - */ - public String getLCSortableByType( - Record record, String fieldSpec, String callTypeSf, String callType) { - String sortKey = null; - for (String tag : fieldSpec.split(":")) { - // Check to ensure tag length is at least 3 characters - if (tag.length() < 3) { - //TODO: Should this go to a log? Better message for a bad tag in a field spec? - System.err.println("Invalid tag specified: " + tag); - continue; - } - String dfTag = tag.substring(0, 3); - String sfSpec = null; - if (tag.length() > 3) { - sfSpec = tag.substring(3); - } - - // do all fields for this tag - for (VariableField vf : record.getVariableFields(dfTag)) { - // Assume tag represents a DataField - DataField df = (DataField) vf; - boolean callTypeMatch = false; - - // Assume call type subfield could repeat - for (Subfield typeSf : df.getSubfields(callTypeSf)) { - if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { - callTypeMatch = true; - } - } - // take the first call number coded as LC - if (callTypeMatch) { - sortKey = new LCCallNumber(df.getSubfieldsAsString(sfSpec)).getShelfKey(); - break; - } - } // end loop over variable fields - } // end loop over fieldSpec - return sortKey; - } - - /** - * Extract a numeric portion of the Dewey decimal call number - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param precisionStr a decimal number (represented in string format) showing the - * desired precision of the returned number; i.e. 100 to round to nearest hundred, - * 10 to round to nearest ten, 0.1 to round to nearest tenth, etc. 
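- * For example, with a precision of "10", a Dewey class of 615.9 falls into the 610 bucket.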
- * @return Set containing requested numeric portions of Dewey decimal call numbers - */ - public Set<String> getDeweyNumber(Record record, String fieldSpec, String precisionStr) { - // Initialize our return value: - Set<String> result = new LinkedHashSet<String>(); - - // Precision comes in as a string, but we need to convert it to a float: - float precision = Float.parseFloat(precisionStr); - - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - for (String current: input) { - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - // Convert the numeric portion of the call number into a float: - float currentVal = Float.parseFloat(callNum.getClassification()); - - // Round the call number value to the specified precision: - Float finalVal = new Float(Math.floor(currentVal / precision) * precision); - - // Convert the rounded value back to a string (with leading zeros) and save it: - // TODO: Provide different conversion to remove CallNumUtils dependency - result.add(CallNumUtils.normalizeFloat(finalVal.toString(), 3, -1)); - } - } - - // If we found no call number matches, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; - } - - /** - * Normalize Dewey numbers for searching purposes (uppercase/stripped spaces) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return Set containing normalized Dewey numbers extracted from specified fields. - */ - public Set<String> getDeweySearchable(Record record, String fieldSpec) { - // Initialize our return value: - Set<String> result = new LinkedHashSet<String>(); - - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - Iterator<String> iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // Add valid strings to the set, normalizing them to be all uppercase - // and free from whitespace. - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - result.add(callNum.toString().toUpperCase().replaceAll(" ", "")); - } - } - - // If we found no call numbers, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; - } - - /** - * Normalize Dewey numbers for sorting purposes (use only the first valid number!) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return String containing the first valid Dewey number encountered, normalized - * for sorting purposes. - */ - public String getDeweySortable(Record record, String fieldSpec) { - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - Iterator<String> iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // If this is a valid Dewey number, return the sortable shelf key: - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - return callNum.getShelfKey(); - } - } - - // If we made it this far, we didn't find a valid sortable Dewey number: - return null; - } - - /** - * Get sort key for first Dewey call number, identified by call type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. 
- * - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @return sort key for first identified Dewey call number - */ - public static String getDeweySortableByType( - Record record, String fieldSpec, String callTypeSf, String callType) { - String sortKey = null; - for (String tag : fieldSpec.split(":")) { - // Check to ensure tag length is at least 3 characters - if (tag.length() < 3) { - //TODO: Should this go to a log? Better message for a bad tag in a field spec? - System.err.println("Invalid tag specified: " + tag); - continue; - } - String dfTag = tag.substring(0, 3); - String sfSpec = null; - if (tag.length() > 3) { - sfSpec = tag.substring(3); - } - - // do all fields for this tag - for (VariableField vf : record.getVariableFields(dfTag)) { - // Assume tag represents a DataField - DataField df = (DataField) vf; - boolean callTypeMatch = false; - - // Assume call type subfield could repeat - for (Subfield typeSf : df.getSubfields(callTypeSf)) { - if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { - callTypeMatch = true; - } - } - // take the first call number coded as Dewey - if (callTypeMatch) { - sortKey = new DeweyCallNumber(df.getSubfieldsAsString(sfSpec)).getShelfKey(); - break; - } - } // end loop over variable fields - } // end loop over fieldSpec - return sortKey; - } - - - /** - * Normalize Dewey numbers for AlphaBrowse sorting purposes (use all numbers!) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return List containing normalized Dewey numbers extracted from specified fields. - */ - public List<String> getDeweySortables(Record record, String fieldSpec) { - // Initialize our return value: - List<String> result = new LinkedList<String>(); - - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - Iterator<String> iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // gather all sort keys, even if number is not valid - DeweyCallNumber callNum = new DeweyCallNumber(current); - result.add(callNum.getShelfKey()); - } - - // If we found no call numbers, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; - } - - /** - * The following several methods are designed to get latitude and longitude - * coordinates. - * Records can have multiple coordinates sets of points and/or rectangles. - * Points are represented by coordinate sets where N=S E=W. - * - * code adapted from xrosecky - Moravian Library - * https://github.com/moravianlibrary/VuFind-2.x/blob/master/import/index_scripts/geo.bsh - * and incorporates VuFind location.bsh functionality for GoogleMap display. - */ - - /** - * Convert MARC coordinates into location_geo format. 
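- * For example, 034 $d E0790000 $e E0860000 $f N0200000 $g N0120000 becomes - * {@code ENVELOPE(79.0,86.0,20.0,12.0)}.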
- * - * @param Record record - * @return List geo_coordinates - */ - public List<String> getAllCoordinates(Record record) { - List<String> geo_coordinates = new ArrayList<String>(); - List<VariableField> list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String d = df.getSubfield('d').getData(); - String e = df.getSubfield('e').getData(); - String f = df.getSubfield('f').getData(); - String g = df.getSubfield('g').getData(); - //System.out.println("raw Coords: "+d+" "+e+" "+f+" "+g); - - // Check to see if there are only 2 coordinates - // If so, copy them into the corresponding coordinate fields - if ((d !=null && (e == null || e.trim().equals(""))) && (f != null && (g==null || g.trim().equals("")))) { - e = d; - g = f; - } - if ((e !=null && (d == null || d.trim().equals(""))) && (g != null && (f==null || f.trim().equals("")))) { - d = e; - f = g; - } - - // Check and convert coordinates to +/- decimal degrees - Double west = convertCoordinate(d); - Double east = convertCoordinate(e); - Double north = convertCoordinate(f); - Double south = convertCoordinate(g); - - // New Format for indexing coordinates in Solr 5.0 - minX, maxX, maxY, minY - // Note - storage in Solr follows the WENS order, but display is WSEN order - String result = String.format("ENVELOPE(%s,%s,%s,%s)", new Object[] { west, east, north, south }); - - if (validateCoordinates(west, east, north, south)) { - geo_coordinates.add(result); - } - } - } - return geo_coordinates; - } - - /** - * Get point coordinates for GoogleMap display. - * - * @param Record record - * @return List coordinates - */ - public List<String> getPointCoordinates(Record record) { - List<String> coordinates = new ArrayList<String>(); - List<VariableField> list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String d = df.getSubfield('d').getData(); - String e = df.getSubfield('e').getData(); - String f = df.getSubfield('f').getData(); - String g = df.getSubfield('g').getData(); - - // Check to see if there are only 2 coordinates - if ((d !=null && (e == null || e.trim().equals(""))) && (f != null && (g==null || g.trim().equals("")))) { - Double long_val = convertCoordinate(d); - Double lat_val = convertCoordinate(f); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - if ((e !=null && (d == null || d.trim().equals(""))) && (g != null && (f==null || f.trim().equals("")))) { - Double long_val = convertCoordinate(e); - Double lat_val = convertCoordinate(g); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - // Check if N=S and E=W - if (d.equals(e) && f.equals(g)) { - Double long_val = convertCoordinate(d); - Double lat_val = convertCoordinate(f); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - } - } - return coordinates; - } - - /** - * Get all available coordinates from the record. 
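- * Values are reported exactly as found in the 034, in $d $e $f $g - * (west east north south) order.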
- * - * @param Record record - * @return List geo_coordinates - */ - public List<String> getDisplayCoordinates(Record record) { - List<String> geo_coordinates = new ArrayList<String>(); - List<VariableField> list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String west = df.getSubfield('d').getData(); - String east = df.getSubfield('e').getData(); - String north = df.getSubfield('f').getData(); - String south = df.getSubfield('g').getData(); - String result = String.format("%s %s %s %s", new Object[] { west, east, north, south }); - if (west != null || east != null || north != null || south != null) { - geo_coordinates.add(result); - } - } - } - return geo_coordinates; - } - - /** - * Check coordinate type HDMS HDD or +/-DD. - * - * @param String coordinateStr - * @return Double coordinate - */ - protected Double convertCoordinate(String coordinateStr) { - Double coordinate = Double.NaN; - Matcher HDmatcher = HDMSHDD_PATTERN.matcher(coordinateStr); - Matcher PMDmatcher = PMDD_PATTERN.matcher(coordinateStr); - if (HDmatcher.matches()) { - String hemisphere = HDmatcher.group(1).toUpperCase(); - Double degrees = Double.parseDouble(HDmatcher.group(2)); - // Check for HDD or HDMS - if (hemisphere.equals("N") || hemisphere.equals("S")) { - if (degrees > 90) { - String hdmsCoordinate = hemisphere+"0"+HDmatcher.group(2); - coordinate = coordinateToDecimal(hdmsCoordinate); - } else { - coordinate = Double.parseDouble(HDmatcher.group(2)); - if (hemisphere.equals("S")) { - coordinate *= -1; - } - } - } - if (hemisphere.equals("E") || hemisphere.equals("W")) { - if (degrees > 180) { - String hdmsCoordinate = HDmatcher.group(0); - coordinate = coordinateToDecimal(hdmsCoordinate); - } else { - coordinate = Double.parseDouble(HDmatcher.group(2)); - if (hemisphere.equals("W")) { - coordinate *= -1; - } - } - } - return coordinate; - } else if (PMDmatcher.matches()) { - String hemisphere = PMDmatcher.group(1); - coordinate = Double.parseDouble(PMDmatcher.group(2)); - if (hemisphere.equals("-")) { - coordinate *= -1; - } - return coordinate; - } else { - return null; - } - } - - /** - * Convert HDMS coordinates to decimal degrees. - * - * @param String coordinateStr - * @return Double coordinate - */ - protected Double coordinateToDecimal(String coordinateStr) { - Matcher matcher = COORDINATES_PATTERN.matcher(coordinateStr); - if (matcher.matches()) { - String hemisphere = matcher.group(1).toUpperCase(); - int degrees = Integer.parseInt(matcher.group(2)); - int minutes = Integer.parseInt(matcher.group(3)); - int seconds = Integer.parseInt(matcher.group(4)); - double coordinate = degrees + (minutes / 60.0) + (seconds / 3600.0); - if (hemisphere.equals("W") || hemisphere.equals("S")) { - coordinate *= -1; - } - return coordinate; - } - return null; - } - - /** - * Check decimal degree coordinates to make sure they are valid. - * - * @param Double west, east, north, south - * @return boolean - */ - protected boolean validateCoordinates(Double west, Double east, Double north, Double south) { - if (west == null || east == null || north == null || south == null) { - return false; - } - if (west > 180.0 || west < -180.0 || east > 180.0 || east < -180.0) { - return false; - } - if (north > 90.0 || north < -90.0 || south > 90.0 || south < -90.0) { - return false; - } - if (north < south || west > east) { - return false; - } - return true; - } - - /** - * THIS FUNCTION HAS BEEN DEPRECATED. 
- * Determine the longitude and latitude of the item's location. - * - * @param record current MARC record - * @return string of form "longitude, latitude" - */ - public String getLongLat(Record record) { - // Check 034 subfield d and f - List<VariableField> fields = record.getVariableFields("034"); - Iterator<VariableField> fieldsIter = fields.iterator(); - if (fields != null) { - DataField physical; - while(fieldsIter.hasNext()) { - physical = (DataField) fieldsIter.next(); - String val = null; - - List<Subfield> subfields_d = physical.getSubfields('d'); - Iterator<Subfield> subfieldsIter_d = subfields_d.iterator(); - if (subfields_d != null) { - while (subfieldsIter_d.hasNext()) { - val = subfieldsIter_d.next().getData().trim(); - if (!val.matches("-?\\d+(\\.\\d+)?")) { - return null; - } - } - } - List<Subfield> subfields_f = physical.getSubfields('f'); - Iterator<Subfield> subfieldsIter_f = subfields_f.iterator(); - if (subfields_f != null) { - while (subfieldsIter_f.hasNext()) { - String val2 = subfieldsIter_f.next().getData().trim(); - if (!val2.matches("-?\\d+(\\.\\d+)?")) { - return null; - } - val = val + ',' + val2; - } - } - return val; - } - } - //otherwise return null - return null; - } - - /** - * Update the index date in the database for the specified core/ID pair. We - * maintain a database of "first/last indexed" times separately from Solr to - * allow the history of our indexing activity to be stored permanently in a - * fashion that can survive even a total Solr rebuild. - */ - public UpdateDateTracker updateTracker(String core, String id, java.util.Date latestTransaction) - { - // Update the database (if necessary): - try { - // Initialize date tracker if not already initialized: - loadUpdateDateTracker(); - - tracker.index(core, id, latestTransaction); - } catch (java.sql.SQLException e) { - // If we're in the process of shutting down, an error is expected: - if (!shuttingDown) { - dieWithError("Unexpected database error"); - } - } - - // Send back the tracker object so the caller can use it (helpful for - // use in BeanShell scripts). - return tracker; - } - - /** - * Get the "first indexed" date for the current record. (This is the first - * time that SolrMarc ever encountered this particular record). - * - * @param record current MARC record - * @param fieldSpec fields / subfields to be analyzed - * @param core core name - * @return first indexed date string - */ - public String getFirstIndexed(Record record, String fieldSpec, String core) { - // Update the database, then send back the first indexed date: - updateTracker(core, getFirstFieldVal(record, fieldSpec), getLatestTransaction(record)); - return tracker.getFirstIndexed(); - } - - /** - * Get the "first indexed" date for the current record. (This is the first - * time that SolrMarc ever encountered this particular record). - * - * @param record current MARC record - * @param fieldSpec fields / subfields to be analyzed - * @return first indexed date string - */ - public String getFirstIndexed(Record record, String fieldSpec) { - return getFirstIndexed(record, fieldSpec, "biblio"); - } - - /** - * Get the "first indexed" date for the current record. (This is the first - * time that SolrMarc ever encountered this particular record). - * - * @param record current MARC record - * @return first indexed date string - */ - public String getFirstIndexed(Record record) { - return getFirstIndexed(record, "001", "biblio"); - } - - /** - * Get the "last indexed" date for the current record. (This is the last time - * the record changed from SolrMarc's perspective).
- * - * @param record current MARC record - * @param fieldSpec fields / subfields to be analyzed - * @param core core name - * @return last indexed date string - */ - public String getLastIndexed(Record record, String fieldSpec, String core) { - // Update the database, then send back the last indexed date: - updateTracker(core, getFirstFieldVal(record, fieldSpec), getLatestTransaction(record)); - return tracker.getLastIndexed(); - } - - /** - * Get the "last indexed" date for the current record. (This is the last time - * the record changed from SolrMarc's perspective). - * - * @param record current MARC record - * @param fieldSpec fields / subfields to analyze - * @return last indexed date string - */ - public String getLastIndexed(Record record, String fieldSpec) { - return getLastIndexed(record, fieldSpec, "biblio"); - } - - /** - * Get the "last indexed" date for the current record. (This is the last time - * the record changed from SolrMarc's perspective). - * - * @param record current MARC record - * @return last indexed date string - */ - public String getLastIndexed(Record record) { - return getLastIndexed(record, "001", "biblio"); - } - - /** - * Load configurations for the full text parser. Return an array containing the - * parser type in the first element and the parser configuration in the second - * element. - * - * @return String[] - */ - public String[] getFulltextParserSettings() - { - String parserType = getConfigSetting( - "fulltext.ini", "General", "parser" - ); - if (null != parserType) { - parserType = parserType.toLowerCase(); - } - - // Is Aperture active? - String aperturePath = getConfigSetting( - "fulltext.ini", "Aperture", "webcrawler" - ); - if ((null == parserType && null != aperturePath) - || (null != parserType && parserType.equals("aperture")) - ) { - String[] array = { "aperture", aperturePath }; - return array; - } - - // Is Tika active?
- String tikaPath = getConfigSetting( - "fulltext.ini", "Tika", "path" - ); - if ((null == parserType && null != tikaPath) - || (null != parserType && parserType.equals("tika")) - ) { - String[] array = { "tika", tikaPath }; - return array; - } - - // No recognized parser found: - String[] array = { "none", null }; - return array; - } - - /** - * Extract full-text from the documents referenced in the tags - * - * @param Record record current MARC record - * @param String field spec to search for URLs - * @param String only harvest files matching this extension (null for all) - * @return String The full-text - */ - public String getFulltext(Record record, String fieldSpec, String extension) { - String result = ""; - - // Get the web crawler settings (and return no text if it is unavailable) - String[] parserSettings = getFulltextParserSettings(); - if (parserSettings[0].equals("none")) { - return null; - } - - // Loop through the specified MARC fields: - Set<String> fields = getFieldList(record, fieldSpec); - Iterator<String> fieldsIter = fields.iterator(); - if (fields != null) { - while(fieldsIter.hasNext()) { - // Get the current string to work on (and sanitize spaces): - String current = fieldsIter.next().replaceAll(" ", "%20"); - // Filter by file extension - if (extension == null || current.endsWith(extension)) { - // Load the parser output for each tag into a string - result = result + harvestWithParser(current, parserSettings); - } - } - } - // return string to SolrMarc - return result; - } - - /** - * Extract full-text from the documents referenced in the tags - * - * @param Record record current MARC record - * @param String field spec to search for URLs - * @return String The full-text - */ - public String getFulltext(Record record, String fieldSpec) { - return getFulltext(record, fieldSpec, null); - } - - /** - * Extract full-text from the documents referenced in the tags - * - * @param Record record current MARC record - * @return String The full-text - */ - public String getFulltext(Record record) { - return getFulltext(record, "856u", null); - } - - /** - * Clean up XML data generated by Aperture - * - * @param f file to clean - * @return a fixed version of the file - */ - public File sanitizeApertureOutput(File f) throws IOException - { - //clean up the aperture xml output - File tempFile = File.createTempFile("buffer", ".tmp"); - FileOutputStream fw = new FileOutputStream(tempFile); - OutputStreamWriter writer = new OutputStreamWriter(fw, "UTF8"); - - //delete this control character from the File and save - FileReader fr = new FileReader(f); - BufferedReader br = new BufferedReader(fr); - while (br.ready()) { - writer.write(sanitizeFullText(br.readLine())); - } - writer.close(); - br.close(); - fr.close(); - - return tempFile; - } - - /** - * Clean up bad characters in the full text. - * - * @param text text to clean - * @return cleaned text - */ - public String sanitizeFullText(String text) - { - String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; - return text.replaceAll(badChars, " "); - } - - /** - * Harvest the contents of a document file (PDF, Word, etc.) using Aperture. - * This method will only work if Aperture is properly configured in the - * fulltext.ini file. Without proper configuration, this will simply return an - * empty string. - * - * @param url the url extracted from the MARC tag. 
- * @param aperturePath The path to Aperture - * @return full-text extracted from url - */ - public String harvestWithAperture(String url, String aperturePath) { - String plainText = ""; - // Create temp file. - File f = null; - try { - f = File.createTempFile("apt", ".txt"); - } catch (Throwable e) { - dieWithError("Unable to create temporary file for full text harvest."); - } - - // Delete temp file when program exits. - f.deleteOnExit(); - - // Construct the command to call Aperture - String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url; - - // Call Aperture - //System.out.println("Loading fulltext from " + url + ". Please wait ..."); - try { - Process p = Runtime.getRuntime().exec(cmd); - - // Debugging output - /* - BufferedReader stdInput = new BufferedReader(new - InputStreamReader(p.getInputStream())); - String s; - while ((s = stdInput.readLine()) != null) { - System.out.println(s); - } - */ - - // Wait for Aperture to finish - p.waitFor(); - } catch (Throwable e) { - logger.error("Problem executing Aperture -- " + e.getMessage()); - } - - // Parse Aperture XML output - Document xmlDoc = null; - try { - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = dbf.newDocumentBuilder(); - File tempFile = sanitizeApertureOutput(f); - xmlDoc = db.parse(tempFile); - NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); - if(nl != null && nl.getLength() > 0) { - Node node = nl.item(0); - if (node.getNodeType() == Node.ELEMENT_NODE) { - plainText = plainText + node.getTextContent(); - } - } - - // we'll hold onto the temp file if it failed to parse for debugging; - // only set it up to be deleted if we've made it this far successfully. - tempFile.deleteOnExit(); - } catch (Throwable e) { - logger.error("Problem parsing Aperture XML -- " + e.getMessage()); - } - - return plainText; - } - - /** - * Harvest the contents of a document file (PDF, Word, etc.) using Tika. - * This method will only work if Tika is properly configured in the fulltext.ini - * file. Without proper configuration, this will simply return an empty string. - * - * @param url the url extracted from the MARC tag. - * @param scraperPath path to Tika - * @return the full-text - */ - public String harvestWithTika(String url, String scraperPath) { - - // Construct the command - String cmd = "java -jar " + scraperPath + " -t -eUTF8 " + url; - - StringBuilder stringBuilder= new StringBuilder(); - - // Call our scraper - //System.out.println("Loading fulltext from " + url + ". Please wait ..."); - try { - Process p = Runtime.getRuntime().exec(cmd); - BufferedReader stdInput = new BufferedReader(new - InputStreamReader(p.getInputStream(), "UTF8")); - - // We'll build the string from the command output - String s; - while ((s = stdInput.readLine()) != null) { - stringBuilder.append(s); - } - } catch (Throwable e) { - logger.error("Problem with Tika -- " + e.getMessage()); - } - - return sanitizeFullText(stringBuilder.toString()); - } - - /** - * Harvest the contents of a document file (PDF, Word, etc.) using the active parser. - * - * @param url the URL extracted from the MARC tag. - * @param settings configuration settings from {@code getFulltextParserSettings}. 
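One caveat worth noting about both harvesters above: Runtime.getRuntime().exec(String) tokenizes the command on whitespace, which works here only because getFulltext percent-encodes spaces in URLs before the harvesters are called. A hedged alternative sketch, not part of the original code, that passes arguments explicitly via ProcessBuilder so paths or URLs containing spaces survive intact (the tika-app.jar path is a placeholder):

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;

public class TikaRunner {
    // Sketch only: invoke Tika with an explicit argument list.
    public static String run(String url) throws Exception {
        ProcessBuilder pb = new ProcessBuilder(
            "java", "-jar", "tika-app.jar", "-t", "-eUTF8", url);
        pb.redirectErrorStream(true); // merge stderr into stdout
        Process p = pb.start();
        StringBuilder out = new StringBuilder();
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(p.getInputStream(), "UTF8"))) {
            String line;
            while ((line = r.readLine()) != null) {
                out.append(line).append('\n');
            }
        }
        p.waitFor();
        return out.toString();
    }
}
```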
- * @return the full-text - */ - public String harvestWithParser(String url, String[] settings) { - if (settings[0].equals("aperture")) { - return harvestWithAperture(url, settings[1]); - } else if (settings[0].equals("tika")) { - return harvestWithTika(url, settings[1]); - } - return null; - } - - /** - * Get access to the Logger object. - * - * @return Logger - */ - public Logger getLogger() - { - return logger; - } - - /** - * Extract all valid relator terms from a list of subfields using a whitelist. - * @param subfields List of subfields to check - * @param permittedRoles Whitelist to check against - * @param indexRawRelators Should we index relators raw, as found - * in the MARC (true) or index mapped versions (false)? - * @return Set of valid relator terms - */ - public Set<String> getValidRelatorsFromSubfields(List<Subfield> subfields, List<String> permittedRoles, Boolean indexRawRelators) - { - Set<String> relators = new LinkedHashSet<String>(); - for (int j = 0; j < subfields.size(); j++) { - String raw = subfields.get(j).getData(); - String current = normalizeRelatorString(raw); - if (permittedRoles.contains(current)) { - relators.add(indexRawRelators ? raw : mapRelatorStringToCode(current)); - } - } - return relators; - } - - /** - * Is this relator term unknown to author-classification.ini? - * @param current relator to check - * @return True if unknown - */ - public Boolean isUnknownRelator(String current) - { - // If we haven't loaded known relators yet, do so now: - if (knownRelators.size() == 0) { - Map<String, String> all = getConfigSection("author-classification.ini", "RelatorSynonyms"); - for (String key : all.keySet()) { - knownRelators.add(normalizeRelatorString(key)); - for (String synonym: all.get(key).split("\\|")) { - knownRelators.add(normalizeRelatorString(synonym)); - } - } - } - return !knownRelators.contains(normalizeRelatorString(current)); - } - - /** - * Extract all valid relator terms from a list of subfields using a whitelist. - * @param subfields List of subfields to check - * @return Set of valid relator terms - */ - public Set<String> getUnknownRelatorsFromSubfields(List<Subfield> subfields) - { - Set<String> relators = new LinkedHashSet<String>(); - for (int j = 0; j < subfields.size(); j++) { - String current = subfields.get(j).getData().trim(); - if (current.length() > 0 && isUnknownRelator(current)) { - logger.info("Unknown relator: " + current); - relators.add(current); - } - } - return relators; - } - - /** - * Extract all values that meet the specified relator requirements. - * @param authorField Field to analyze - * @param noRelatorAllowed Array of tag names which are allowed to be used with - * no declared relator. - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param unknownRelatorAllowed Array of tag names whose relators should be indexed - * even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. 
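To make the whitelist check in getValidRelatorsFromSubfields above concrete, here is an illustrative sketch. It assumes an already-constructed instance of this class (here called indexer) and uses marc4j's MarcFactory to fabricate subfields; the permitted-roles list is assumed to be pre-normalized (lowercased, punctuation stripped), which normalizeRelatorStringList guarantees in real use:

```java
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.marc4j.marc.MarcFactory;
import org.marc4j.marc.Subfield;

public class RelatorDemo {
    static Set<String> demo(VuFindIndexer indexer) {
        MarcFactory factory = MarcFactory.newInstance();
        List<Subfield> subfields = Arrays.asList(
            factory.newSubfield('4', "edt"),    // coded relator
            factory.newSubfield('e', "Editor.") // textual relator; punctuation is normalized away
        );
        List<String> permitted = Arrays.asList("edt", "editor");
        // With indexRawRelators=false, matches are mapped back to codes
        // where a synonym mapping exists; otherwise returned as-is.
        return indexer.getValidRelatorsFromSubfields(subfields, permitted, false);
    }
}
```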
- * @return Set - */ - public Set<String> getValidRelators(DataField authorField, - String[] noRelatorAllowed, String relatorConfig, - String[] unknownRelatorAllowed, String indexRawRelators - ) { - // get tag number from Field - String tag = authorField.getTag(); - List<Subfield> subfieldE = authorField.getSubfields('e'); - List<Subfield> subfield4 = authorField.getSubfields('4'); - - Set<String> relators = new LinkedHashSet<String>(); - - // if no relator is found, check to see if the current tag is in the "no - // relator allowed" list. - if (subfieldE.size() == 0 && subfield4.size() == 0) { - if (Arrays.asList(noRelatorAllowed).contains(tag)) { - relators.add(""); - } - } else { - // If we got this far, we need to figure out what type of relation they have - List permittedRoles = normalizeRelatorStringList(Arrays.asList(loadRelatorConfig(relatorConfig))); - relators.addAll(getValidRelatorsFromSubfields(subfieldE, permittedRoles, indexRawRelators.toLowerCase().equals("true"))); - relators.addAll(getValidRelatorsFromSubfields(subfield4, permittedRoles, indexRawRelators.toLowerCase().equals("true"))); - if (Arrays.asList(unknownRelatorAllowed).contains(tag)) { - Set<String> unknown = getUnknownRelatorsFromSubfields(subfieldE); - if (unknown.size() == 0) { - unknown = getUnknownRelatorsFromSubfields(subfield4); - } - relators.addAll(unknown); - } - } - return relators; - } - - /** - * Parse a SolrMarc fieldspec into a map of tag name to set of subfield strings - * (note that we need to map to a set rather than a single string, because the - * same tag may repeat with different subfields to extract different sections - * of the same field into distinct values). - * - * @param tagList The field specification to parse - * @return HashMap - */ - protected HashMap<String, Set<String>> getParsedTagList(String tagList) - { - String[] tags = tagList.split(":");//convert string input to array - HashMap<String, Set<String>> tagMap = new HashMap<String, Set<String>>(); - //cut tags array up into key/value pairs in hash map - Set<String> currentSet; - for(int i = 0; i < tags.length; i++){ - String tag = tags[i].substring(0, 3); - if (!tagMap.containsKey(tag)) { - currentSet = new LinkedHashSet<String>(); - tagMap.put(tag, currentSet); - } else { - currentSet = tagMap.get(tag); - } - currentSet.add(tags[i].substring(3)); - } - return tagMap; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @param firstOnly Return first result only? 
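A worked example of getParsedTagList above, as an illustrative fragment (the method is protected, so this would run inside the class or a subclass):

```java
// Hypothetical fieldspec in which tag 700 repeats with different subfield groups:
HashMap<String, Set<String>> parsed = getParsedTagList("700abcd:700e:710ab");
// parsed now maps:
//   "700" -> { "abcd", "e" }   (two distinct subfield groups, kept as a set)
//   "710" -> { "ab" }
```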
- * @return List result - */ - public List<String> getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly - ) { - List<String> result = new LinkedList<String>(); - String[] noRelatorAllowed = acceptWithoutRelator.split(":"); - String[] unknownRelatorAllowed = acceptUnknownRelators.split(":"); - HashMap<String, Set<String>> parsedTagList = getParsedTagList(tagList); - List fields = getFieldSetMatchingTagList(record, tagList); - Iterator fieldsIter = fields.iterator(); - if (fields != null){ - DataField authorField; - while (fieldsIter.hasNext()){ - authorField = (DataField) fieldsIter.next(); - // add all author types to the result set; if we have multiple relators, repeat the authors - for (String iterator: getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators)) { - for (String subfields : parsedTagList.get(authorField.getTag())) { - String current = getDataFromVariableField(authorField, "["+subfields+"]", " ", false); - // TODO: we may eventually be able to use this line instead, - // but right now it's not handling separation between the - // subfields correctly, so it's commented out until that is - // fixed. - //String current = authorField.getSubfieldsAsString(subfields); - if (null != current) { - result.add(current); - if (firstOnly) { - return result; - } - } - } - } - } - } - return result; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ - public List<String> getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig - ) { - // default firstOnly to false! - return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false", false - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - */ - public List<String> getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators - ) { - // default firstOnly to false! 
- return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false", false - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - */ - public List<String> getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators - ) { - // default firstOnly to false! - return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, false - ); - } - - /** - * If the provided relator is included in the synonym list, convert it back to - * a code (for better standardization/translation). - * - * @param relator Relator code to check - * @return Code version, if found, or raw string if no match found. - */ - public String mapRelatorStringToCode(String relator) - { - String normalizedRelator = normalizeRelatorString(relator); - return relatorSynonymLookup.containsKey(normalizedRelator) - ? relatorSynonymLookup.get(normalizedRelator) : relator; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return String - */ - public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators - ) { - List<String> result = getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, true - ); - for (String s : result) { - return s; - } - return null; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. 
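The synonym lookup behind mapRelatorStringToCode above is populated lazily by normalizeRelatorAndAddSynonyms, so the mapping only succeeds once a relator config has been loaded. An illustrative fragment built on a hypothetical RelatorSynonyms entry:

```java
// Hypothetical author-classification.ini entry:
//   [RelatorSynonyms]
//   edt = editor|ed.
//
// Loading that config registers both normalized synonyms under the code:
//   relatorSynonymLookup: { "editor" -> "edt", "ed" -> "edt" }

String code = mapRelatorStringToCode("Editor.");
// normalizeRelatorString("Editor.") yields "editor", which is in the lookup,
// so code == "edt"; an unmapped term such as "Translator" is returned as-is.
```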
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return String - */ - public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig - ) { - return getFirstAuthorFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false" - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return String - */ - public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators - ) { - return getFirstAuthorFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false" - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @param firstOnly Return first result only? 
- * @return List result - */ - public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly - ) { - List result = new LinkedList(); - String[] noRelatorAllowed = acceptWithoutRelator.split(":"); - String[] unknownRelatorAllowed = acceptUnknownRelators.split(":"); - HashMap<String, Set<String>> parsedTagList = getParsedTagList(tagList); - List fields = getFieldSetMatchingTagList(record, tagList); - Iterator fieldsIter = fields.iterator(); - if (fields != null){ - DataField authorField; - while (fieldsIter.hasNext()){ - authorField = (DataField) fieldsIter.next(); - //add all author types to the result set - result.addAll(getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators)); - } - } - return result; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return List result - */ - public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators - ) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, false - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return List result - */ - public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators - ) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false", false - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. 
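getRelatorsFilteredByRelator above is the counterpart of getAuthorsFilteredByRelator: both apply the same filters to the same fields, but one collects name strings and the other collects the relator terms, and the "repeat the authors" logic in the author variant appears intended to keep the two result lists aligned when called with identical arguments. An illustrative fragment (record is a MARC record; "secondary_author_roles" is a hypothetical setting name):

```java
// Illustrative parallel calls with identical filter arguments:
List<String> names = getAuthorsFilteredByRelator(
    record, "700abcd", "700", "secondary_author_roles");
List relators = getRelatorsFilteredByRelator(
    record, "700abcd", "700", "secondary_author_roles");
// Indexing both lists lets the display layer label each
// secondary author with the role that caused its acceptance.
```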
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ - public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig - ) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false", false - ); - } - - /** - * This method fetches relator definitions from ini file and casts them to an - * array. If a colon-delimited string is passed in, this will be directly parsed - * instead of resorting to .ini loading. - * - * @param setting Setting to load from .ini or colon-delimited list. - * @return String[] - */ - protected String[] loadRelatorConfig(String setting){ - StringBuilder relators = new StringBuilder(); - - // check for pipe-delimited string - String[] relatorSettings = setting.split("\\|"); - for (String relatorSetting: relatorSettings) { - // check for colon-delimited string - String[] relatorArray = relatorSetting.split(":"); - if (relatorArray.length > 1) { - for (int i = 0; i < relatorArray.length; i++) { - relators.append(relatorArray[i]).append(","); - } - } else { - relators.append(getConfigSetting( - "author-classification.ini", "AuthorRoles", relatorSetting - )).append(","); - } - } - - return relators.toString().split(","); - } - - /** - * Normalizes a relator string and returns a list containing the normalized - * relator plus any configured synonyms. - * - * @param relator Relator term to normalize - * @return List of strings - */ - public List<String> normalizeRelatorAndAddSynonyms(String relator) - { - List<String> newList = new ArrayList<String>(); - String normalized = normalizeRelatorString(relator); - newList.add(normalized); - String synonyms = getConfigSetting( - "author-classification.ini", "RelatorSynonyms", relator - ); - if (null != synonyms && synonyms.length() > 0) { - for (String synonym: synonyms.split("\\|")) { - String normalizedSynonym = normalizeRelatorString(synonym); - relatorSynonymLookup.put(normalizedSynonym, relator); - newList.add(normalizedSynonym); - } - } - return newList; - } - - /** - * Normalizes the strings in a list. - * - * @param stringList List of strings to be normalized - * @return Normalized List of strings - */ - protected List<String> normalizeRelatorStringList(List<String> stringList) - { - List<String> newList = new ArrayList<String>(); - for (String relator: stringList) { - newList.addAll(normalizeRelatorAndAddSynonyms(relator)); - } - return newList; - } - - /** - * Normalizes a string - * - * @param string String to be normalized - * @return string - */ - protected String normalizeRelatorString(String string) - { - return string - .trim() - .toLowerCase() - .replaceAll("\\p{Punct}+", ""); //POSIX character class Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. 
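loadRelatorConfig above accepts three input shapes. Note that values fetched from the ini are appended verbatim and the combined buffer is only split on commas at the end, so ini entries are expected to be comma-delimited. An illustrative fragment using a hypothetical [AuthorRoles] entry `primary = author,creator`:

```java
// 1. Plain setting name: resolved via [AuthorRoles] in author-classification.ini
//    (hypothetical entry: primary = author,creator)
String[] roles = loadRelatorConfig("primary");           // -> {"author", "creator"}

// 2. Colon-delimited literal list: used directly, no ini lookup
roles = loadRelatorConfig("author:illustrator");         // -> {"author", "illustrator"}

// 3. Pipe-delimited mix: each segment resolved by rule 1 or rule 2
roles = loadRelatorConfig("primary|editor:translator");  // -> all four roles combined
```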
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return List result - */ - public List<String> getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators - ) { - List<String> authors = getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators - ); - List<String> result = new LinkedList<String>(); - for (String author : authors) { - result.add(processInitials(author)); - } - return result; - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ - public List<String> getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig - ) { - return getAuthorInitialsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false" - ); - } - - /** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return List result - */ - public List<String> getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators - ) { - return getAuthorInitialsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false" - ); - } - - /** - * Takes a name and cuts it into initials - * @param authorName e.g. Yeats, William Butler - * @return initials e.g. 
w b y wb - */ - protected String processInitials(String authorName) { - Boolean isPersonalName = false; - // we guess that if there is a comma before the end - this is a personal name - if ((authorName.indexOf(',') > 0) - && (authorName.indexOf(',') < authorName.length()-1)) { - isPersonalName = true; - } - // get rid of non-alphabet chars but keep hyphens and accents - authorName = authorName.replaceAll("[^\\p{L} -]", "").toLowerCase(); - String[] names = authorName.split(" "); //split into tokens on spaces - // if this is a personal name we'll reorganise to put lastname at the end - String result = ""; - if (isPersonalName) { - String lastName = names[0]; - for (int i = 0; i < names.length-1; i++) { - names[i] = names[i+1]; - } - names[names.length-1] = lastName; - } - // put all the initials together in a space separated string - for (String name : names) { - if (name.length() > 0) { - String initial = name.substring(0,1); - // if there is a hyphenated name, use both initials - int pos = name.indexOf('-'); - if (pos > 0 && pos < name.length() - 1) { - String extra = name.substring(pos+1, pos+2); - initial = initial + " " + extra; - } - result += " " + initial; - } - } - // grab all initials and stick them together - String smushAll = result.replaceAll(" ", ""); - // if it's a long personal name, get all but the last initials as well - // e.g. wb for william butler yeats - if (names.length > 2 && isPersonalName) { - String smushPers = result.substring(0,result.length()-1).replaceAll(" ",""); - result = result + " " + smushPers; - } - // now we have initials separate and together - if (!result.trim().equals(smushAll)) { - result += " " + smushAll; - } - result = result.trim(); - return result; - } - - /** - * Normalize trailing punctuation. This mimics the functionality built into VuFind's - * textFacet field type, so that you can get equivalent values when indexing into - * a string field. (Useful for docValues support). - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return Set containing normalized values - */ - public Set<String> normalizeTrailingPunctuation(Record record, String fieldSpec) { - // Initialize our return value: - Set<String> result = new LinkedHashSet<String>(); - - // Loop through the specified MARC fields: - Set<String> input = getFieldList(record, fieldSpec); - Pattern pattern = Pattern.compile("(?<!\b[A-Z])[.\\s]*$"); - for (String current: input) { - result.add(pattern.matcher(current).replaceAll("")); - } - - // If we found no matches, return null; otherwise, return our results: - return result.isEmpty() ? null : result; - } -} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/CallNumberTools.java b/import/index_java/src/org/vufind/index/CallNumberTools.java new file mode 100644 index 0000000000000000000000000000000000000000..00138e4e1e533d5c400b9916e29ad7cbef7709a5 --- /dev/null +++ b/import/index_java/src/org/vufind/index/CallNumberTools.java @@ -0,0 +1,517 @@ +package org.vufind.index; +/** + * Call number indexing routines. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import org.marc4j.marc.Record; +import org.marc4j.marc.VariableField; +import org.marc4j.marc.DataField; +import org.marc4j.marc.Subfield; +import org.solrmarc.callnum.DeweyCallNumber; +import org.solrmarc.callnum.LCCallNumber; +import org.solrmarc.index.SolrIndexer; +import org.solrmarc.tools.CallNumUtils; + +/** + * Call number indexing routines. + */ +public class CallNumberTools +{ + /** + * Extract the full call number from a record, stripped of spaces + * @param record MARC record + * @return Full call number, stripped of spaces + * @deprecated Obsolete as of VuFind 2.4. + * This method exists only to support the VuFind call number search, version <= 2.3. + * As of VuFind 2.4, the munging for call number search is handled entirely in Solr. + */ + @Deprecated + public String getFullCallNumber(final Record record) { + + return(getFullCallNumber(record, "099ab:090ab:050ab")); + } + + /** + * Extract the full call number from a record, stripped of spaces + * @param record MARC record + * @param fieldSpec taglist for call number fields + * @return Full call number, stripped of spaces + * @deprecated Obsolete as of VuFind 2.4. + * This method exists only to support the VuFind call number search, version <= 2.3. + * As of VuFind 2.4, the munging for call number search is handled entirely in Solr. 
+ */ + @Deprecated + public String getFullCallNumber(final Record record, String fieldSpec) { + + String val = SolrIndexer.instance().getFirstFieldVal(record, fieldSpec); + + if (val != null) { + return val.toUpperCase().replaceAll(" ", ""); + } else { + return val; + } + } + + /** + * Extract the call number label from a record + * @param record MARC record + * @return Call number label + */ + public String getCallNumberLabel(final Record record) { + + return getCallNumberLabel(record, "090a:050a"); + } + + /** + * Extract the call number label from a record + * @param record MARC record + * @param fieldSpec taglist for call number fields + * @return Call number label + */ + public String getCallNumberLabel(final Record record, String fieldSpec) { + + String val = SolrIndexer.instance().getFirstFieldVal(record, fieldSpec); + + if (val != null) { + int dotPos = val.indexOf("."); + if (dotPos > 0) { + val = val.substring(0, dotPos); + } + return val.toUpperCase(); + } else { + return val; + } + } + + /** + * Extract the subject component of the call number + * + * Can return null + * + * @param record MARC record + * @return Call number subject letters + */ + public String getCallNumberSubject(final Record record) { + + return(getCallNumberSubject(record, "090a:050a")); + } + + /** + * Extract the subject component of the call number + * + * Can return null + * + * @param record current MARC record + * @return Call number subject letters + */ + public String getCallNumberSubject(final Record record, String fieldSpec) { + + String val = SolrIndexer.instance().getFirstFieldVal(record, fieldSpec); + + if (val != null) { + String [] callNumberSubject = val.toUpperCase().split("[^A-Z]+"); + if (callNumberSubject.length > 0) + { + return callNumberSubject[0]; + } + } + return(null); + } + + /** + * Normalize a single LC call number + * @param record current MARC record + * @return String Normalized LCCN + */ + public String getFullCallNumberNormalized(final Record record) { + + return(getFullCallNumberNormalized(record, "099ab:090ab:050ab")); + } + + /** + * Normalize a single LC call number + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @return String Normalized LC call number + */ + public String getFullCallNumberNormalized(final Record record, String fieldSpec) { + + // TODO: is the null fieldSpec still an issue? + if (fieldSpec != null) { + String cn = SolrIndexer.instance().getFirstFieldVal(record, fieldSpec); + return (new LCCallNumber(cn)).getShelfKey(); + } + // If we got this far, we couldn't find a valid value: + return null; + } + + /** + * Get call numbers of a specific type. + * + * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. + * + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param callTypeSf subfield containing call number type, single character only + * @param callType literal call number code + * @param result a collection to gather the call numbers + * @return collection of call numbers, same object as {@code result} + */ + public static Collection<String> getCallNumberByTypeCollector( + Record record, String fieldSpec, String callTypeSf, String callType, Collection<String> result) { + for (String tag : fieldSpec.split(":")) { + // Check to ensure tag length is at least 3 characters + if (tag.length() < 3) { + //TODO: Should this go to a log? Better message for a bad tag in a field spec? 
+ System.err.println("Invalid tag specified: " + tag); + continue; + } + String dfTag = tag.substring(0, 3); + String sfSpec = null; + if (tag.length() > 3) { + sfSpec = tag.substring(3); + } + + // do all fields for this tag + for (VariableField vf : record.getVariableFields(dfTag)) { + // Assume tag represents a DataField + DataField df = (DataField) vf; + boolean callTypeMatch = false; + + // Assume call type subfield could repeat + for (Subfield typeSf : df.getSubfields(callTypeSf)) { + if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { + callTypeMatch = true; + } + } + System.err.println("callTypeMatch after loop: " + callTypeMatch); + if (callTypeMatch) { + result.add(df.getSubfieldsAsString(sfSpec)); + } + } // end loop over variable fields + } // end loop over fieldSpec + return result; + } + + + /** + * Get call numbers of a specific type. + * + * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param callTypeSf subfield containing call number type, single character only + * @param callType literal call number code + * @return set of call numbers + */ + public static Set<String> getCallNumberByType(Record record, String fieldSpec, String callTypeSf, String callType) { + return (Set<String>) getCallNumberByTypeCollector(record, fieldSpec, callTypeSf, callType, + new LinkedHashSet<String>()); + } + + /** + * Get call numbers of a specific type. + * + * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param callTypeSf subfield containing call number type, single character only + * @param callType literal call number code + * @return list of call numbers + */ + public static List<String> getCallNumberByTypeAsList(Record record, String fieldSpec, String callTypeSf, String callType) { + return (List<String>) getCallNumberByTypeCollector(record, fieldSpec, callTypeSf, callType, + new ArrayList<String>()); + } + + /** + * Normalize LC numbers for sorting purposes (use only the first valid number!). + * Will return first call number found if none pass validation, + * or empty string if no call numbers. + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @return sortable shelf key of the first valid LC number encountered, + * otherwise shelf key of the first call number found. + */ + public String getLCSortable(Record record, String fieldSpec) { + // Loop through the specified MARC fields: + Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec); + String firstCall = ""; + for (String current : input) { + // If this is a valid LC number, return the sortable shelf key: + LCCallNumber callNum = new LCCallNumber(current); + if (callNum.isValid()) { + return callNum.getShelfKey(); // RETURN first valid + } + if (firstCall.length() == 0) { + firstCall = current; + } + } + + // If we made it this far, did not find a valid LC number, so use what we have: + return new LCCallNumber(firstCall).getShelfKey(); + } + + /** + * Get sort key for first LC call number, identified by call type. + * + * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. 
+ * + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param callTypeSf subfield containing call number type, single character only + * @param callType literal call number code + * @return sort key for first identified LC call number + */ + public String getLCSortableByType( + Record record, String fieldSpec, String callTypeSf, String callType) { + String sortKey = null; + for (String tag : fieldSpec.split(":")) { + // Check to ensure tag length is at least 3 characters + if (tag.length() < 3) { + //TODO: Should this go to a log? Better message for a bad tag in a field spec? + System.err.println("Invalid tag specified: " + tag); + continue; + } + String dfTag = tag.substring(0, 3); + String sfSpec = null; + if (tag.length() > 3) { + sfSpec = tag.substring(3); + } + + // do all fields for this tag + for (VariableField vf : record.getVariableFields(dfTag)) { + // Assume tag represents a DataField + DataField df = (DataField) vf; + boolean callTypeMatch = false; + + // Assume call type subfield could repeat + for (Subfield typeSf : df.getSubfields(callTypeSf)) { + if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { + callTypeMatch = true; + } + } + // take the first call number coded as LC + if (callTypeMatch) { + sortKey = new LCCallNumber(df.getSubfieldsAsString(sfSpec)).getShelfKey(); + break; + } + } // end loop over variable fields + } // end loop over fieldSpec + return sortKey; + } + + /** + * Extract a numeric portion of the Dewey decimal call number + * + * Can return null + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param precisionStr a decimal number (represented in string format) showing the + * desired precision of the returned number; i.e. 100 to round to nearest hundred, + * 10 to round to nearest ten, 0.1 to round to nearest tenth, etc. 
+ * @return Set containing requested numeric portions of Dewey decimal call numbers + */ + public Set<String> getDeweyNumber(Record record, String fieldSpec, String precisionStr) { + // Initialize our return value: + Set<String> result = new LinkedHashSet<String>(); + + // Precision comes in as a string, but we need to convert it to a float: + float precision = Float.parseFloat(precisionStr); + + // Loop through the specified MARC fields: + Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec); + for (String current: input) { + DeweyCallNumber callNum = new DeweyCallNumber(current); + if (callNum.isValid()) { + // Convert the numeric portion of the call number into a float: + float currentVal = Float.parseFloat(callNum.getClassification()); + + // Round the call number value to the specified precision: + Float finalVal = new Float(Math.floor(currentVal / precision) * precision); + + // Convert the rounded value back to a string (with leading zeros) and save it: + // TODO: Provide different conversion to remove CallNumUtils dependency + result.add(CallNumUtils.normalizeFloat(finalVal.toString(), 3, -1)); + } + } + + // If we found no call number matches, return null; otherwise, return our results: + if (result.isEmpty()) + return null; + return result; + } + + /** + * Normalize Dewey numbers for searching purposes (uppercase/stripped spaces) + * + * Can return null + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @return Set containing normalized Dewey numbers extracted from specified fields. + */ + public Set<String> getDeweySearchable(Record record, String fieldSpec) { + // Initialize our return value: + Set<String> result = new LinkedHashSet<String>(); + + // Loop through the specified MARC fields: + Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec); + Iterator<String> iter = input.iterator(); + while (iter.hasNext()) { + // Get the current string to work on: + String current = iter.next(); + + // Add valid strings to the set, normalizing them to be all uppercase + // and free from whitespace. + DeweyCallNumber callNum = new DeweyCallNumber(current); + if (callNum.isValid()) { + result.add(callNum.toString().toUpperCase().replaceAll(" ", "")); + } + } + + // If we found no call numbers, return null; otherwise, return our results: + if (result.isEmpty()) + return null; + return result; + } + + /** + * Normalize Dewey numbers for sorting purposes (use only the first valid number!) + * + * Can return null + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @return String containing the first valid Dewey number encountered, normalized + * for sorting purposes. + */ + public String getDeweySortable(Record record, String fieldSpec) { + // Loop through the specified MARC fields: + Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec); + Iterator<String> iter = input.iterator(); + while (iter.hasNext()) { + // Get the current string to work on: + String current = iter.next(); + + // If this is a valid Dewey number, return the sortable shelf key: + DeweyCallNumber callNum = new DeweyCallNumber(current); + if (callNum.isValid()) { + return callNum.getShelfKey(); + } + } + + // If we made it this far, we didn't find a valid sortable Dewey number: + return null; + } + + /** + * Get sort key for first Dewey call number, identified by call type. 
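Despite the "round to nearest" phrasing in the getDeweyNumber javadoc above, the arithmetic is floor-based bucketing: 512.54 with precision 10 lands in the 510 bucket regardless of proximity to 520. CallNumUtils.normalizeFloat then left-pads the result so that string sorting matches numeric order. The core arithmetic in isolation:

```java
public class DeweyBucketDemo {
    public static void main(String[] args) {
        float precision = Float.parseFloat("10"); // precisionStr as passed in
        float classification = 512.54f;           // numeric part of a Dewey number
        double bucket = Math.floor(classification / precision) * precision;
        System.out.println(bucket);               // 510.0, then normalized for indexing
    }
}
```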
+ * + * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. + * + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @param callTypeSf subfield containing call number type, single character only + * @param callType literal call number code + * @return sort key for first identified Dewey call number + */ + public static String getDeweySortableByType( + Record record, String fieldSpec, String callTypeSf, String callType) { + String sortKey = null; + for (String tag : fieldSpec.split(":")) { + // Check to ensure tag length is at least 3 characters + if (tag.length() < 3) { + //TODO: Should this go to a log? Better message for a bad tag in a field spec? + System.err.println("Invalid tag specified: " + tag); + continue; + } + String dfTag = tag.substring(0, 3); + String sfSpec = null; + if (tag.length() > 3) { + sfSpec = tag.substring(3); + } + + // do all fields for this tag + for (VariableField vf : record.getVariableFields(dfTag)) { + // Assume tag represents a DataField + DataField df = (DataField) vf; + boolean callTypeMatch = false; + + // Assume call type subfield could repeat + for (Subfield typeSf : df.getSubfields(callTypeSf)) { + if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { + callTypeMatch = true; + } + } + // take the first call number coded as Dewey + if (callTypeMatch) { + sortKey = new DeweyCallNumber(df.getSubfieldsAsString(sfSpec)).getShelfKey(); + break; + } + } // end loop over variable fields + } // end loop over fieldSpec + return sortKey; + } + + + /** + * Normalize Dewey numbers for AlphaBrowse sorting purposes (use all numbers!) + * + * Can return null + * + * @param record current MARC record + * @param fieldSpec which MARC fields / subfields need to be analyzed + * @return List containing normalized Dewey numbers extracted from specified fields. + */ + public List<String> getDeweySortables(Record record, String fieldSpec) { + // Initialize our return value: + List<String> result = new LinkedList<String>(); + + // Loop through the specified MARC fields: + Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec); + Iterator<String> iter = input.iterator(); + while (iter.hasNext()) { + // Get the current string to work on: + String current = iter.next(); + + // gather all sort keys, even if number is not valid + DeweyCallNumber callNum = new DeweyCallNumber(current); + result.add(callNum.getShelfKey()); + } + + // If we found no call numbers, return null; otherwise, return our results: + if (result.isEmpty()) + return null; + return result; + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/ConfigManager.java b/import/index_java/src/org/vufind/index/ConfigManager.java new file mode 100644 index 0000000000000000000000000000000000000000..d9f25087780f1f1cb74304638973b56ba62e1a99 --- /dev/null +++ b/import/index_java/src/org/vufind/index/ConfigManager.java @@ -0,0 +1,234 @@ +package org.vufind.index; +/** + * VuFind configuration manager + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import java.io.File; +import java.io.FileReader; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.ConcurrentHashMap; +import org.solrmarc.index.indexer.ValueIndexerFactory; +import org.solrmarc.tools.PropertyUtils; +import org.solrmarc.tools.SolrMarcIndexerException; +import org.ini4j.Ini; +import org.apache.log4j.Logger; + +/** + * VuFind configuration manager + */ +public class ConfigManager +{ + // Initialize logging category + static Logger logger = Logger.getLogger(ConfigManager.class.getName()); + private static ConcurrentHashMap<String, Ini> configCache = new ConcurrentHashMap<String, Ini>(); + private Properties vuFindConfigs = null; + private static ThreadLocal<ConfigManager> managerCache = + new ThreadLocal<ConfigManager>() + { + @Override + protected ConfigManager initialValue() + { + return new ConfigManager(); + } + }; + + public ConfigManager() + { + try { + vuFindConfigs = PropertyUtils.loadProperties(ValueIndexerFactory.instance().getHomeDirs(), "vufind.properties"); + } catch (IllegalArgumentException e) { + // If the properties load failed, don't worry about it -- we'll use defaults. + } + } + + public static ConfigManager instance() + { + return managerCache.get(); + } + + /** + * Given the base name of a configuration file, locate the full path. + * @param filename base name of a configuration file + */ + private File findConfigFile(String filename) + { + // Find VuFind's home directory in the environment; if it's not available, + // try using a relative path on the assumption that we are currently in + // VuFind's import subdirectory: + String vufindHome = System.getenv("VUFIND_HOME"); + if (vufindHome == null) { + vufindHome = ".."; + } + + // Check for VuFind 2.0's local directory environment variable: + String vufindLocal = System.getenv("VUFIND_LOCAL_DIR"); + + // Get the relative VuFind path from the properties file, defaulting to + // the 2.0-style config/vufind if necessary. + String relativeConfigPath = PropertyUtils.getProperty( + vuFindConfigs, "vufind.config.relative_path", "config/vufind" + ); + + // Try several different locations for the file -- VuFind 2 local dir, + // VuFind 2 base dir, VuFind 1 base dir. + File file; + if (vufindLocal != null) { + file = new File(vufindLocal + "/" + relativeConfigPath + "/" + filename); + if (file.exists()) { + return file; + } + } + file = new File(vufindHome + "/" + relativeConfigPath + "/" + filename); + if (file.exists()) { + return file; + } + file = new File(vufindHome + "/web/conf/" + filename); + return file; + } + + /** + * Sanitize a VuFind configuration setting. + * @param str configuration setting + */ + private String sanitizeConfigSetting(String str) + { + // Drop comments if necessary: + int pos = str.indexOf(';'); + if (pos >= 0) { + str = str.substring(0, pos).trim(); + } + + // Strip wrapping quotes if necessary (the ini reader won't do this for us): + if (str.startsWith("\"")) { + str = str.substring(1, str.length()); + } + if (str.endsWith("\"")) { + str = str.substring(0, str.length() - 1); + } + return str; + } + + /** + * Load an ini file. 
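Since sanitizeConfigSetting above clips at the first semicolon before unquoting, a quoted value that itself contains a semicolon is truncated; values in these ini files are assumed not to embed semicolons. A standalone re-implementation of the private helper, for illustration only:

```java
public class SanitizeSettingDemo {
    // Mirrors the private sanitizeConfigSetting logic shown above.
    static String sanitize(String str) {
        int pos = str.indexOf(';');
        if (pos >= 0) {
            str = str.substring(0, pos).trim(); // drop ini-style comment
        }
        if (str.startsWith("\"")) { str = str.substring(1); }
        if (str.endsWith("\"")) { str = str.substring(0, str.length() - 1); }
        return str;
    }

    public static void main(String[] args) {
        System.out.println(sanitize("\"config/vufind\" ; local path")); // config/vufind
        // Caveat: a semicolon inside a quoted value is clipped too:
        System.out.println(sanitize("\"a;b\""));                        // a
    }
}
```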
+ * @param filename name of {@code .ini} file + */ + public Ini loadConfigFile(String filename) + { + // Retrieve the file if it is not already cached. + if (!configCache.containsKey(filename)) { + Ini ini = new Ini(); + try { + ini.load(new FileReader(findConfigFile(filename))); + configCache.putIfAbsent(filename, ini); + } catch (Throwable e) { + dieWithError("Unable to access " + filename); + } + } + return configCache.get(filename); + } + + /** + * Get a section from a VuFind configuration file. + * @param filename configuration file name + * @param section section name within the file + */ + public Map<String, String> getConfigSection(String filename, String section) + { + // Grab the ini file. + Ini ini = loadConfigFile(filename); + Map<String, String> retVal = ini.get(section); + + String parent = ini.get("Parent_Config", "path"); + while (parent != null) { + Ini parentIni = loadConfigFile(parent); + Map<String, String> parentSection = parentIni.get(section); + for (String key : parentSection.keySet()) { + if (!retVal.containsKey(key)) { + retVal.put(key, parentSection.get(key)); + } + } + parent = parentIni.get("Parent_Config", "path"); + } + + // Check to see if we need to worry about an override file: + String override = ini.get("Extra_Config", "local_overrides"); + if (override != null) { + Map<String, String> overrideSection = loadConfigFile(override).get(section); + for (String key : overrideSection.keySet()) { + retVal.put(key, overrideSection.get(key)); + } + } + return retVal; + } + + /** + * Get a setting from a VuFind configuration file. + * @param filename configuration file name + * @param section section name within the file + * @param setting setting name within the section + */ + public String getConfigSetting(String filename, String section, String setting) + { + String retVal = null; + + // Grab the ini file. + Ini ini = loadConfigFile(filename); + + // Check to see if we need to worry about an override file: + String override = ini.get("Extra_Config", "local_overrides"); + if (override != null) { + Ini overrideIni = loadConfigFile(override); + retVal = overrideIni.get(section, setting); + if (retVal != null) { + return sanitizeConfigSetting(retVal); + } + } + + // Try to find the requested setting: + retVal = ini.get(section, setting); + + // No setting? Check for a parent configuration: + while (retVal == null) { + String parent = ini.get("Parent_Config", "path"); + if (parent != null) { + try { + ini.load(new FileReader(new File(parent))); + } catch (Throwable e) { + dieWithError("Unable to access " + parent); + } + retVal = ini.get(section, setting); + } else { + break; + } + } + + // Return the processed setting: + return retVal == null ? null : sanitizeConfigSetting(retVal); + } + + /** + * Log an error message and throw a fatal exception. + * @param msg message to log + */ + private void dieWithError(String msg) + { + logger.error(msg); + throw new SolrMarcIndexerException(SolrMarcIndexerException.EXIT, msg); + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/CreatorTools.java b/import/index_java/src/org/vufind/index/CreatorTools.java new file mode 100644 index 0000000000000000000000000000000000000000..3a50dd9ad89fb8144528229fdc9638533c5f7bb6 --- /dev/null +++ b/import/index_java/src/org/vufind/index/CreatorTools.java @@ -0,0 +1,725 @@ +package org.vufind.index; +/** + * Indexing routines for dealing with creators and relator terms. + * + * Copyright (C) Villanova University 2017. 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import org.marc4j.marc.Subfield;
+import org.marc4j.marc.DataField;
+import org.solrmarc.index.SolrIndexer;
+import org.apache.log4j.Logger;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Indexing routines for dealing with creators and relator terms.
+ */
+public class CreatorTools
+{
+    // Initialize logging category
+    static Logger logger = Logger.getLogger(CreatorTools.class.getName());
+
+    private ConcurrentHashMap<String, String> relatorSynonymLookup = RelatorContainer.instance().getSynonymLookup();
+    private Set<String> knownRelators = RelatorContainer.instance().getKnownRelators();
+
+    /**
+     * Extract all valid relator terms from a list of subfields using a whitelist.
+     * @param subfields List of subfields to check
+     * @param permittedRoles Whitelist to check against
+     * @param indexRawRelators Should we index relators raw, as found
+     * in the MARC (true) or index mapped versions (false)?
+     * @return Set of valid relator terms
+     */
+    public Set<String> getValidRelatorsFromSubfields(List<Subfield> subfields, List<String> permittedRoles, Boolean indexRawRelators)
+    {
+        Set<String> relators = new LinkedHashSet<String>();
+        for (int j = 0; j < subfields.size(); j++) {
+            String raw = subfields.get(j).getData();
+            String current = normalizeRelatorString(raw);
+            if (permittedRoles.contains(current)) {
+                relators.add(indexRawRelators ? raw : mapRelatorStringToCode(current));
+            }
+        }
+        return relators;
+    }
+
+    /**
+     * Is this relator term unknown to author-classification.ini?
+     * @param current relator to check
+     * @return True if unknown
+     */
+    public Boolean isUnknownRelator(String current)
+    {
+        // If we haven't loaded known relators yet, do so now:
+        if (knownRelators.size() == 0) {
+            Map<String, String> all = ConfigManager.instance().getConfigSection("author-classification.ini", "RelatorSynonyms");
+            for (String key : all.keySet()) {
+                knownRelators.add(normalizeRelatorString(key));
+                for (String synonym: all.get(key).split("\\|")) {
+                    knownRelators.add(normalizeRelatorString(synonym));
+                }
+            }
+        }
+        return !knownRelators.contains(normalizeRelatorString(current));
+    }
+
+    /**
+     * Extract all unknown relator terms from a list of subfields.
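+     * Each unknown term is logged at INFO level so that unmapped vocabulary
+     * can be spotted in the import log and added to author-classification.ini.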
+     * @param subfields List of subfields to check
+     * @return Set of unknown relator terms
+     */
+    public Set<String> getUnknownRelatorsFromSubfields(List<Subfield> subfields)
+    {
+        Set<String> relators = new LinkedHashSet<String>();
+        for (int j = 0; j < subfields.size(); j++) {
+            String current = subfields.get(j).getData().trim();
+            if (current.length() > 0 && isUnknownRelator(current)) {
+                logger.info("Unknown relator: " + current);
+                relators.add(current);
+            }
+        }
+        return relators;
+    }
+
+    /**
+     * Extract all values that meet the specified relator requirements.
+     * @param authorField Field to analyze
+     * @param noRelatorAllowed Array of tag names which are allowed to be used with
+     * no declared relator.
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param unknownRelatorAllowed Array of tag names whose relators should be indexed
+     * even if they are not listed in author-classification.ini.
+     * @param indexRawRelators Set to "true" to index relators raw, as found
+     * in the MARC or "false" to index mapped versions.
+     * @return Set
+     */
+    public Set<String> getValidRelators(DataField authorField,
+        String[] noRelatorAllowed, String relatorConfig,
+        String[] unknownRelatorAllowed, String indexRawRelators
+    ) {
+        // get tag number from Field
+        String tag = authorField.getTag();
+        List<Subfield> subfieldE = authorField.getSubfields('e');
+        List<Subfield> subfield4 = authorField.getSubfields('4');
+
+        Set<String> relators = new LinkedHashSet<String>();
+
+        // if no relator is found, check to see if the current tag is in the "no
+        // relator allowed" list.
+        if (subfieldE.size() == 0 && subfield4.size() == 0) {
+            if (Arrays.asList(noRelatorAllowed).contains(tag)) {
+                relators.add("");
+            }
+        } else {
+            // If we got this far, we need to figure out what type of relation they have
+            List<String> permittedRoles = normalizeRelatorStringList(Arrays.asList(loadRelatorConfig(relatorConfig)));
+            // Parse the flag once instead of on every call:
+            boolean raw = indexRawRelators.toLowerCase().equals("true");
+            relators.addAll(getValidRelatorsFromSubfields(subfieldE, permittedRoles, raw));
+            relators.addAll(getValidRelatorsFromSubfields(subfield4, permittedRoles, raw));
+            if (Arrays.asList(unknownRelatorAllowed).contains(tag)) {
+                Set<String> unknown = getUnknownRelatorsFromSubfields(subfieldE);
+                if (unknown.size() == 0) {
+                    unknown = getUnknownRelatorsFromSubfields(subfield4);
+                }
+                relators.addAll(unknown);
+            }
+        }
+        return relators;
+    }
+
+    /**
+     * Parse a SolrMarc fieldspec into a map of tag name to set of subfield strings
+     * (note that we need to map to a set rather than a single string, because the
+     * same tag may repeat with different subfields to extract different sections
+     * of the same field into distinct values).
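+     * For example (hypothetical specs), {@code "100abcd:700abcd"} parses to
+     * {@code {100=[abcd], 700=[abcd]}}, while {@code "245a:245b"} parses to
+     * {@code {245=[a, b]}}.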
+     *
+     * @param tagList The field specification to parse
+     * @return HashMap
+     */
+    protected HashMap<String, Set<String>> getParsedTagList(String tagList)
+    {
+        String[] tags = tagList.split(":");//convert string input to array
+        HashMap<String, Set<String>> tagMap = new HashMap<String, Set<String>>();
+        //cut tags array up into key/value pairs in hash map
+        Set<String> currentSet;
+        for(int i = 0; i < tags.length; i++){
+            String tag = tags[i].substring(0, 3);
+            if (!tagMap.containsKey(tag)) {
+                currentSet = new LinkedHashSet<String>();
+                tagMap.put(tag, currentSet);
+            } else {
+                currentSet = tagMap.get(tag);
+            }
+            currentSet.add(tags[i].substring(3));
+        }
+        return tagMap;
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for separating different types of authors.
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @param indexRawRelators Set to "true" to index relators raw, as found
+     * in the MARC or "false" to index mapped versions.
+     * @param firstOnly Return first result only?
+     * @return List result
+     */
+    public List<String> getAuthorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly
+    ) {
+        List<String> result = new LinkedList<String>();
+        String[] noRelatorAllowed = acceptWithoutRelator.split(":");
+        String[] unknownRelatorAllowed = acceptUnknownRelators.split(":");
+        HashMap<String, Set<String>> parsedTagList = getParsedTagList(tagList);
+        List fields = SolrIndexer.instance().getFieldSetMatchingTagList(record, tagList);
+        // Perform the null check before building an iterator:
+        if (fields != null){
+            Iterator fieldsIter = fields.iterator();
+            DataField authorField;
+            while (fieldsIter.hasNext()){
+                authorField = (DataField) fieldsIter.next();
+                // add all author types to the result set; if we have multiple relators, repeat the authors
+                for (String relator : getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators)) {
+                    for (String subfields : parsedTagList.get(authorField.getTag())) {
+                        String current = SolrIndexer.instance().getDataFromVariableField(authorField, "["+subfields+"]", " ", false);
+                        // TODO: we may eventually be able to use this line instead,
+                        // but right now it's not handling separation between the
+                        // subfields correctly, so it's commented out until that is
+                        // fixed.
+                        //String current = authorField.getSubfieldsAsString(subfields);
+                        if (null != current) {
+                            result.add(current);
+                            if (firstOnly) {
+                                return result;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for separating different types of authors.
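+     * In an index properties file this can be wired up along these lines
+     * (hypothetical field and setting names):
+     * <pre>{@code
+     * author = custom(org.vufind.index.CreatorTools),
+     *     getAuthorsFilteredByRelator(100abcd:700abcd, 100:700, author)
+     * }</pre>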
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @return List result
+     */
+    public List<String> getAuthorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig
+    ) {
+        // default firstOnly to false!
+        return getAuthorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptWithoutRelator, "false", false
+        );
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for separating different types of authors.
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @return List result
+     */
+    public List<String> getAuthorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators
+    ) {
+        // default firstOnly to false!
+        return getAuthorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptUnknownRelators, "false", false
+        );
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for separating different types of authors.
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @param indexRawRelators Set to "true" to index relators raw, as found
+     * in the MARC or "false" to index mapped versions.
+     * @return List result
+     */
+    public List<String> getAuthorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators, String indexRawRelators
+    ) {
+        // default firstOnly to false!
+        return getAuthorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptUnknownRelators, indexRawRelators, false
+        );
+    }
+
+    /**
+     * If the provided relator is included in the synonym list, convert it back to
+     * a code (for better standardization/translation).
+     *
+     * @param relator Relator code to check
+     * @return Code version, if found, or raw string if no match found.
+     */
+    public String mapRelatorStringToCode(String relator)
+    {
+        String normalizedRelator = normalizeRelatorString(relator);
+        return relatorSynonymLookup.containsKey(normalizedRelator)
+            ?
relatorSynonymLookup.get(normalizedRelator) : relator; + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. + * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @param acceptUnknownRelators Colon-delimited list of tags whose relators + * should be indexed even if they are not listed in author-classification.ini. + * @param indexRawRelators Set to "true" to index relators raw, as found + * in the MARC or "false" to index mapped versions. + * @return String + */ + public String getFirstAuthorFilteredByRelator(Record record, String tagList, + String acceptWithoutRelator, String relatorConfig, + String acceptUnknownRelators, String indexRawRelators + ) { + List<String> result = getAuthorsFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptUnknownRelators, indexRawRelators, true + ); + for (String s : result) { + return s; + } + return null; + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. + * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @return String + */ + public String getFirstAuthorFilteredByRelator(Record record, String tagList, + String acceptWithoutRelator, String relatorConfig + ) { + return getFirstAuthorFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptWithoutRelator, "false" + ); + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. + * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @param acceptUnknownRelators Colon-delimited list of tags whose relators + * should be indexed even if they are not listed in author-classification.ini. + * @return String + */ + public String getFirstAuthorFilteredByRelator(Record record, String tagList, + String acceptWithoutRelator, String relatorConfig, + String acceptUnknownRelators + ) { + return getFirstAuthorFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptUnknownRelators, "false" + ); + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for saving relators of authors separated by different + * types. 
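+     * Unlike {@code getAuthorsFilteredByRelator}, the values returned are
+     * the relator terms (or mapped codes) themselves rather than the
+     * author headings they qualify.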
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @param indexRawRelators Set to "true" to index relators raw, as found
+     * in the MARC or "false" to index mapped versions.
+     * @param firstOnly Return first result only?
+     * @return List result
+     */
+    public List getRelatorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly
+    ) {
+        List<String> result = new LinkedList<String>();
+        String[] noRelatorAllowed = acceptWithoutRelator.split(":");
+        String[] unknownRelatorAllowed = acceptUnknownRelators.split(":");
+        List fields = SolrIndexer.instance().getFieldSetMatchingTagList(record, tagList);
+        // Perform the null check before building an iterator:
+        if (fields != null){
+            Iterator fieldsIter = fields.iterator();
+            DataField authorField;
+            while (fieldsIter.hasNext()){
+                authorField = (DataField) fieldsIter.next();
+                //add all author types to the result set
+                result.addAll(getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators));
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for saving relators of authors separated by different
+     * types.
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @param indexRawRelators Set to "true" to index relators raw, as found
+     * in the MARC or "false" to index mapped versions.
+     * @return List result
+     */
+    public List getRelatorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators, String indexRawRelators
+    ) {
+        // default firstOnly to false!
+        return getRelatorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptUnknownRelators, indexRawRelators, false
+        );
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for saving relators of authors separated by different
+     * types.
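+     * This overload defaults {@code indexRawRelators} to "false" and
+     * {@code firstOnly} to false.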
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @param acceptUnknownRelators Colon-delimited list of tags whose relators
+     * should be indexed even if they are not listed in author-classification.ini.
+     * @return List result
+     */
+    public List getRelatorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig,
+        String acceptUnknownRelators
+    ) {
+        // default firstOnly to false!
+        return getRelatorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptUnknownRelators, "false", false
+        );
+    }
+
+    /**
+     * Filter values retrieved using tagList to include only those whose relator
+     * values are acceptable. Used for saving relators of authors separated by different
+     * types.
+     *
+     * @param record The record (fed in automatically)
+     * @param tagList The field specification to read
+     * @param acceptWithoutRelator Colon-delimited list of tags whose values should
+     * be accepted even if no relator subfield is defined
+     * @param relatorConfig The setting in author-classification.ini which
+     * defines which relator terms are acceptable (or a colon-delimited list)
+     * @return List result
+     */
+    public List getRelatorsFilteredByRelator(Record record, String tagList,
+        String acceptWithoutRelator, String relatorConfig
+    ) {
+        // default firstOnly to false!
+        return getRelatorsFilteredByRelator(
+            record, tagList, acceptWithoutRelator, relatorConfig,
+            acceptWithoutRelator, "false", false
+        );
+    }
+
+    /**
+     * This method fetches relator definitions from ini file and casts them to an
+     * array. If a colon-delimited string is passed in, this will be directly parsed
+     * instead of resorting to .ini loading.
+     *
+     * @param setting Setting to load from .ini or colon-delimited list.
+     * @return String[]
+     */
+    protected String[] loadRelatorConfig(String setting){
+        StringBuilder relators = new StringBuilder();
+
+        // check for pipe-delimited string
+        String[] relatorSettings = setting.split("\\|");
+        for (String relatorSetting: relatorSettings) {
+            // check for colon-delimited string
+            String[] relatorArray = relatorSetting.split(":");
+            if (relatorArray.length > 1) {
+                for (int i = 0; i < relatorArray.length; i++) {
+                    relators.append(relatorArray[i]).append(",");
+                }
+            } else {
+                String roles = ConfigManager.instance().getConfigSetting(
+                    "author-classification.ini", "AuthorRoles", relatorSetting
+                );
+                // Skip missing settings instead of appending the string "null":
+                if (roles != null) {
+                    relators.append(roles).append(",");
+                }
+            }
+        }
+
+        return relators.toString().split(",");
+    }
+
+    /**
+     * Normalizes a relator string and returns a list containing the normalized
+     * relator plus any configured synonyms.
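+     * For example (hypothetical configuration), if RelatorSynonyms maps
+     * {@code aut} to {@code author|writer}, calling this with "aut" returns
+     * ["aut", "author", "writer"] and records each synonym in the reverse
+     * lookup used by {@code mapRelatorStringToCode}.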
+ * + * @param relator Relator term to normalize + * @return List of strings + */ + public List<String> normalizeRelatorAndAddSynonyms(String relator) + { + List<String> newList = new ArrayList<String>(); + String normalized = normalizeRelatorString(relator); + newList.add(normalized); + String synonyms = ConfigManager.instance().getConfigSetting( + "author-classification.ini", "RelatorSynonyms", relator + ); + if (null != synonyms && synonyms.length() > 0) { + for (String synonym: synonyms.split("\\|")) { + String normalizedSynonym = normalizeRelatorString(synonym); + relatorSynonymLookup.put(normalizedSynonym, relator); + newList.add(normalizedSynonym); + } + } + return newList; + } + + /** + * Normalizes the strings in a list. + * + * @param stringList List of strings to be normalized + * @return Normalized List of strings + */ + protected List<String> normalizeRelatorStringList(List<String> stringList) + { + List<String> newList = new ArrayList<String>(); + for (String relator: stringList) { + newList.addAll(normalizeRelatorAndAddSynonyms(relator)); + } + return newList; + } + + /** + * Normalizes a string + * + * @param string String to be normalized + * @return string + */ + protected String normalizeRelatorString(String string) + { + return string + .trim() + .toLowerCase() + .replaceAll("\\p{Punct}+", ""); //POSIX character class Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. + * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @param acceptUnknownRelators Colon-delimited list of tags whose relators + * should be indexed even if they are not listed in author-classification.ini. + * @param indexRawRelators Set to "true" to index relators raw, as found + * in the MARC or "false" to index mapped versions. + * @return List result + */ + public List<String> getAuthorInitialsFilteredByRelator(Record record, + String tagList, String acceptWithoutRelator, String relatorConfig, + String acceptUnknownRelators, String indexRawRelators + ) { + List<String> authors = getAuthorsFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptUnknownRelators, indexRawRelators + ); + List<String> result = new LinkedList<String>(); + for (String author : authors) { + result.add(processInitials(author)); + } + return result; + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. 
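+     * Each matching author heading is reduced to an initials string by
+     * {@code processInitials}; e.g. "Yeats, William Butler" yields
+     * "w b y wb wby".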
+ * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @return List result + */ + public List<String> getAuthorInitialsFilteredByRelator(Record record, + String tagList, String acceptWithoutRelator, String relatorConfig + ) { + return getAuthorInitialsFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptWithoutRelator, "false" + ); + } + + /** + * Filter values retrieved using tagList to include only those whose relator + * values are acceptable. Used for separating different types of authors. + * + * @param record The record (fed in automatically) + * @param tagList The field specification to read + * @param acceptWithoutRelator Colon-delimited list of tags whose values should + * be accepted even if no relator subfield is defined + * @param relatorConfig The setting in author-classification.ini which + * defines which relator terms are acceptable (or a colon-delimited list) + * @param acceptUnknownRelators Colon-delimited list of tags whose relators + * should be indexed even if they are not listed in author-classification.ini. + * @return List result + */ + public List<String> getAuthorInitialsFilteredByRelator(Record record, + String tagList, String acceptWithoutRelator, String relatorConfig, + String acceptUnknownRelators + ) { + return getAuthorInitialsFilteredByRelator( + record, tagList, acceptWithoutRelator, relatorConfig, + acceptUnknownRelators, "false" + ); + } + + /** + * Takes a name and cuts it into initials + * @param authorName e.g. Yeats, William Butler + * @return initials e.g. w b y wb + */ + protected String processInitials(String authorName) { + Boolean isPersonalName = false; + // we guess that if there is a comma before the end - this is a personal name + if ((authorName.indexOf(',') > 0) + && (authorName.indexOf(',') < authorName.length()-1)) { + isPersonalName = true; + } + // get rid of non-alphabet chars but keep hyphens and accents + authorName = authorName.replaceAll("[^\\p{L} -]", "").toLowerCase(); + String[] names = authorName.split(" "); //split into tokens on spaces + // if this is a personal name we'll reorganise to put lastname at the end + String result = ""; + if (isPersonalName) { + String lastName = names[0]; + for (int i = 0; i < names.length-1; i++) { + names[i] = names[i+1]; + } + names[names.length-1] = lastName; + } + // put all the initials together in a space separated string + for (String name : names) { + if (name.length() > 0) { + String initial = name.substring(0,1); + // if there is a hyphenated name, use both initials + int pos = name.indexOf('-'); + if (pos > 0 && pos < name.length() - 1) { + String extra = name.substring(pos+1, pos+2); + initial = initial + " " + extra; + } + result += " " + initial; + } + } + // grab all initials and stick them together + String smushAll = result.replaceAll(" ", ""); + // if it's a long personal name, get all but the last initials as well + // e.g. 
wb for william butler yeats + if (names.length > 2 && isPersonalName) { + String smushPers = result.substring(0,result.length()-1).replaceAll(" ",""); + result = result + " " + smushPers; + } + // now we have initials separate and together + if (!result.trim().equals(smushAll)) { + result += " " + smushAll; + } + result = result.trim(); + return result; + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/DatabaseManager.java b/import/index_java/src/org/vufind/index/DatabaseManager.java new file mode 100644 index 0000000000000000000000000000000000000000..43a4cb429d8aa74b451684978649cc280da05f66 --- /dev/null +++ b/import/index_java/src/org/vufind/index/DatabaseManager.java @@ -0,0 +1,156 @@ +package org.vufind.index; +/** + * Database manager. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import org.apache.log4j.Logger; +import org.solrmarc.tools.SolrMarcIndexerException; +import java.sql.*; + +/** + * Database manager. + */ +public class DatabaseManager +{ + // Initialize logging category + static Logger logger = Logger.getLogger(DatabaseManager.class.getName()); + + // Initialize VuFind database connection (null until explicitly activated) + private Connection vufindDatabase = null; + + // Shutdown flag: + private boolean shuttingDown = false; + + private static ThreadLocal<DatabaseManager> managerCache = + new ThreadLocal<DatabaseManager>() + { + @Override + protected DatabaseManager initialValue() + { + return new DatabaseManager(); + } + }; + + public static DatabaseManager instance() + { + return managerCache.get(); + } + + /** + * Log an error message and throw a fatal exception. + * @param msg message to log + */ + private void dieWithError(String msg) + { + logger.error(msg); + throw new SolrMarcIndexerException(SolrMarcIndexerException.EXIT, msg); + } + + /** + * Connect to the VuFind database if we do not already have a connection. + */ + private void connectToDatabase() + { + // Already connected? Do nothing further! 
+        if (vufindDatabase != null) {
+            return;
+        }
+
+        String dsn = ConfigManager.instance().getConfigSetting("config.ini", "Database", "database");
+        if (dsn == null) {
+            dieWithError("Missing Database/database setting in config.ini");
+        }
+
+        try {
+            // Parse key settings from the PHP-style DSN,
+            // e.g. mysql://user:pass@host/dbname:
+            String username = "";
+            String password = "";
+            String classname = "invalid";
+            String prefix = "invalid";
+            // Use startsWith() rather than substring(0, 8) so a short DSN
+            // cannot throw StringIndexOutOfBoundsException:
+            if (dsn.startsWith("mysql://")) {
+                classname = "com.mysql.jdbc.Driver";
+                prefix = "mysql";
+            } else if (dsn.startsWith("pgsql://")) {
+                classname = "org.postgresql.Driver";
+                prefix = "postgresql";
+            }
+
+            Class.forName(classname).newInstance();
+            String[] parts = dsn.split("://");
+            if (parts.length > 1) {
+                parts = parts[1].split("@");
+                if (parts.length > 1) {
+                    dsn = prefix + "://" + parts[1];
+                    parts = parts[0].split(":");
+                    username = parts[0];
+                    if (parts.length > 1) {
+                        password = parts[1];
+                    }
+                }
+            }
+
+            // Connect to the database:
+            vufindDatabase = DriverManager.getConnection("jdbc:" + dsn, username, password);
+        } catch (Throwable e) {
+            dieWithError("Unable to connect to VuFind database");
+        }
+
+        Runtime.getRuntime().addShutdownHook(new DatabaseManagerShutdownThread(this));
+    }
+
+    private void disconnectFromDatabase()
+    {
+        if (vufindDatabase != null) {
+            try {
+                vufindDatabase.close();
+            } catch (SQLException e) {
+                System.err.println("Unable to disconnect from VuFind database");
+                logger.error("Unable to disconnect from VuFind database");
+            }
+        }
+    }
+
+    public void shutdown()
+    {
+        disconnectFromDatabase();
+        shuttingDown = true;
+    }
+
+    public Connection getConnection()
+    {
+        connectToDatabase();
+        return vufindDatabase;
+    }
+
+    public boolean isShuttingDown()
+    {
+        return shuttingDown;
+    }
+
+    class DatabaseManagerShutdownThread extends Thread
+    {
+        private DatabaseManager manager;
+
+        public DatabaseManagerShutdownThread(DatabaseManager m)
+        {
+            manager = m;
+        }
+
+        public void run()
+        {
+            manager.shutdown();
+        }
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/DateTools.java b/import/index_java/src/org/vufind/index/DateTools.java
new file mode 100644
index 0000000000000000000000000000000000000000..343b382df1cf3376150ec12df90cc61be3a57f71
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/DateTools.java
@@ -0,0 +1,103 @@
+package org.vufind.index;
+/**
+ * Date indexing routines.
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import org.marc4j.marc.VariableField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.Subfield;
+import org.solrmarc.tools.DataUtil;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Date indexing routines.
+ */
+public class DateTools
+{
+    /**
+     * Get all available dates from the record.
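+     * Dates are taken from 260$c, then from RDA-style 264$c, where
+     * publication dates (second indicator 1) take precedence over
+     * copyright dates (second indicator 4).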
+     *
+     * @param record MARC record
+     * @return set of dates
+     */
+    public Set<String> getDates(final Record record) {
+        Set<String> dates = new LinkedHashSet<String>();
+
+        // First check old-style 260c date:
+        List<VariableField> list260 = record.getVariableFields("260");
+        for (VariableField vf : list260) {
+            DataField df = (DataField) vf;
+            List<Subfield> currentDates = df.getSubfields('c');
+            for (Subfield sf : currentDates) {
+                String currentDateStr = DataUtil.cleanDate(sf.getData());
+                if (currentDateStr != null) dates.add(currentDateStr);
+            }
+        }
+
+        // Now track down relevant RDA-style 264c dates; we only care about
+        // copyright and publication dates (and ignore copyright dates if
+        // publication dates are present).
+        Set<String> pubDates = new LinkedHashSet<String>();
+        Set<String> copyDates = new LinkedHashSet<String>();
+        List<VariableField> list264 = record.getVariableFields("264");
+        for (VariableField vf : list264) {
+            DataField df = (DataField) vf;
+            // The indicator is per-field, so read it once outside the subfield loop:
+            char ind2 = df.getIndicator2();
+            List<Subfield> currentDates = df.getSubfields('c');
+            for (Subfield sf : currentDates) {
+                String currentDateStr = DataUtil.cleanDate(sf.getData());
+                switch (ind2)
+                {
+                    case '1':
+                        if (currentDateStr != null) pubDates.add(currentDateStr);
+                        break;
+                    case '4':
+                        if (currentDateStr != null) copyDates.add(currentDateStr);
+                        break;
+                }
+            }
+        }
+        if (pubDates.size() > 0) {
+            dates.addAll(pubDates);
+        } else if (copyDates.size() > 0) {
+            dates.addAll(copyDates);
+        }
+
+        return dates;
+    }
+
+    /**
+     * Get the earliest publication date from the record.
+     *
+     * @param record MARC record
+     * @return earliest date
+     */
+    public String getFirstDate(final Record record) {
+        String result = null;
+        Set<String> dates = getDates(record);
+        for (String current : dates) {
+            try {
+                if (result == null || Integer.parseInt(current) < Integer.parseInt(result)) {
+                    result = current;
+                }
+            } catch (NumberFormatException e) {
+                // Skip any value that did not clean up to a plain numeric year.
+            }
+        }
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/FormatCalculator.java b/import/index_java/src/org/vufind/index/FormatCalculator.java
new file mode 100644
index 0000000000000000000000000000000000000000..0c97299cae8c73a1a52fd399e2024bf16480d9c5
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/FormatCalculator.java
@@ -0,0 +1,327 @@
+package org.vufind.index;
+/**
+ * Format determination logic.
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import org.marc4j.marc.ControlField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.VariableField;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Format determination logic.
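+ * Formats are derived in order of precedence: a 245$h containing
+ * "[electronic resource]", then any 007 fields, then leader/06, and
+ * finally leader/07 (using 008/21 to distinguish serial types).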
+ */ +public class FormatCalculator +{ + /** + * Determine Record Format(s) + * + * @param record MARC record + * @return set of record formats + */ + public Set<String> getFormat(final Record record){ + Set<String> result = new LinkedHashSet<String>(); + String leader = record.getLeader().toString(); + char leaderBit; + ControlField fixedField = (ControlField) record.getVariableField("008"); + DataField title = (DataField) record.getVariableField("245"); + String formatString; + char formatCode = ' '; + char formatCode2 = ' '; + char formatCode4 = ' '; + + // check if there's an h in the 245 + if (title != null) { + if (title.getSubfield('h') != null){ + if (title.getSubfield('h').getData().toLowerCase().contains("[electronic resource]")) { + result.add("Electronic"); + return result; + } + } + } + + // check the 007 - this is a repeating field + List<VariableField> fields = record.getVariableFields("007"); + Iterator<VariableField> fieldsIter = fields.iterator(); + if (fields != null) { + // TODO: update loop to for(:) syntax, but problem with type casting. + ControlField formatField; + while(fieldsIter.hasNext()) { + formatField = (ControlField) fieldsIter.next(); + formatString = formatField.getData().toUpperCase(); + formatCode = formatString.length() > 0 ? formatString.charAt(0) : ' '; + formatCode2 = formatString.length() > 1 ? formatString.charAt(1) : ' '; + formatCode4 = formatString.length() > 4 ? formatString.charAt(4) : ' '; + switch (formatCode) { + case 'A': + switch(formatCode2) { + case 'D': + result.add("Atlas"); + break; + default: + result.add("Map"); + break; + } + break; + case 'C': + switch(formatCode2) { + case 'A': + result.add("TapeCartridge"); + break; + case 'B': + result.add("ChipCartridge"); + break; + case 'C': + result.add("DiscCartridge"); + break; + case 'F': + result.add("TapeCassette"); + break; + case 'H': + result.add("TapeReel"); + break; + case 'J': + result.add("FloppyDisk"); + break; + case 'M': + case 'O': + result.add("CDROM"); + break; + case 'R': + // Do not return - this will cause anything with an + // 856 field to be labeled as "Electronic" + break; + default: + result.add("Software"); + break; + } + break; + case 'D': + result.add("Globe"); + break; + case 'F': + result.add("Braille"); + break; + case 'G': + switch(formatCode2) { + case 'C': + case 'D': + result.add("Filmstrip"); + break; + case 'T': + result.add("Transparency"); + break; + default: + result.add("Slide"); + break; + } + break; + case 'H': + result.add("Microfilm"); + break; + case 'K': + switch(formatCode2) { + case 'C': + result.add("Collage"); + break; + case 'D': + result.add("Drawing"); + break; + case 'E': + result.add("Painting"); + break; + case 'F': + result.add("Print"); + break; + case 'G': + result.add("Photonegative"); + break; + case 'J': + result.add("Print"); + break; + case 'L': + result.add("Drawing"); + break; + case 'O': + result.add("FlashCard"); + break; + case 'N': + result.add("Chart"); + break; + default: + result.add("Photo"); + break; + } + break; + case 'M': + switch(formatCode2) { + case 'F': + result.add("VideoCassette"); + break; + case 'R': + result.add("Filmstrip"); + break; + default: + result.add("MotionPicture"); + break; + } + break; + case 'O': + result.add("Kit"); + break; + case 'Q': + result.add("MusicalScore"); + break; + case 'R': + result.add("SensorImage"); + break; + case 'S': + switch(formatCode2) { + case 'D': + result.add("SoundDisc"); + break; + case 'S': + result.add("SoundCassette"); + break; + default: + 
result.add("SoundRecording"); + break; + } + break; + case 'V': + switch(formatCode2) { + case 'C': + result.add("VideoCartridge"); + break; + case 'D': + switch(formatCode4) { + case 'S': + result.add("BRDisc"); + break; + case 'V': + default: + result.add("VideoDisc"); + break; + } + break; + case 'F': + result.add("VideoCassette"); + break; + case 'R': + result.add("VideoReel"); + break; + default: + result.add("Video"); + break; + } + break; + } + } + if (!result.isEmpty()) { + return result; + } + } + + // check the Leader at position 6 + leaderBit = leader.charAt(6); + switch (Character.toUpperCase(leaderBit)) { + case 'C': + case 'D': + result.add("MusicalScore"); + break; + case 'E': + case 'F': + result.add("Map"); + break; + case 'G': + result.add("Slide"); + break; + case 'I': + result.add("SoundRecording"); + break; + case 'J': + result.add("MusicRecording"); + break; + case 'K': + result.add("Photo"); + break; + case 'M': + result.add("Electronic"); + break; + case 'O': + case 'P': + result.add("Kit"); + break; + case 'R': + result.add("PhysicalObject"); + break; + case 'T': + result.add("Manuscript"); + break; + } + if (!result.isEmpty()) { + return result; + } + + // check the Leader at position 7 + leaderBit = leader.charAt(7); + switch (Character.toUpperCase(leaderBit)) { + // Monograph + case 'M': + if (formatCode == 'C') { + result.add("eBook"); + } else { + result.add("Book"); + } + break; + // Component parts + case 'A': + result.add("BookComponentPart"); + break; + case 'B': + result.add("SerialComponentPart"); + break; + // Serial + case 'S': + // Look in 008 to determine what type of Continuing Resource + formatCode = fixedField.getData().toUpperCase().charAt(21); + switch (formatCode) { + case 'N': + result.add("Newspaper"); + break; + case 'P': + result.add("Journal"); + break; + default: + result.add("Serial"); + break; + } + } + + // Nothing worked! + if (result.isEmpty()) { + result.add("Unknown"); + } + + return result; + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/FullTextTools.java b/import/index_java/src/org/vufind/index/FullTextTools.java new file mode 100644 index 0000000000000000000000000000000000000000..0ab0bb3c7198cd465d1b158290154ca868f148c3 --- /dev/null +++ b/import/index_java/src/org/vufind/index/FullTextTools.java @@ -0,0 +1,311 @@ +package org.vufind.index; +/** + * Full text retrieval indexing routines. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import org.marc4j.marc.Record; +import java.io.*; +import java.util.Iterator; +import java.util.Set; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.apache.log4j.Logger; +import org.solrmarc.index.SolrIndexer; +import org.solrmarc.tools.SolrMarcIndexerException; + +/** + * Full text retrieval indexing routines. + */ +public class FullTextTools +{ + // Initialize logging category + static Logger logger = Logger.getLogger(FullTextTools.class.getName()); + + /** + * Load configurations for the full text parser. Return an array containing the + * parser type in the first element and the parser configuration in the second + * element. + * + * @return String[] + */ + public String[] getFulltextParserSettings() + { + String parserType = ConfigManager.instance().getConfigSetting( + "fulltext.ini", "General", "parser" + ); + if (null != parserType) { + parserType = parserType.toLowerCase(); + } + + // Is Aperture active? + String aperturePath = ConfigManager.instance().getConfigSetting( + "fulltext.ini", "Aperture", "webcrawler" + ); + if ((null == parserType && null != aperturePath) + || (null != parserType && parserType.equals("aperture")) + ) { + String[] array = { "aperture", aperturePath }; + return array; + } + + // Is Tika active? + String tikaPath = ConfigManager.instance().getConfigSetting( + "fulltext.ini", "Tika", "path" + ); + if ((null == parserType && null != tikaPath) + || (null != parserType && parserType.equals("tika")) + ) { + String[] array = { "tika", tikaPath }; + return array; + } + + // No recognized parser found: + String[] array = { "none", null }; + return array; + } + + /** + * Extract full-text from the documents referenced in the tags + * + * @param Record record current MARC record + * @param String field spec to search for URLs + * @param String only harvest files matching this extension (null for all) + * @return String The full-text + */ + public String getFulltext(Record record, String fieldSpec, String extension) { + String result = ""; + + // Get the web crawler settings (and return no text if it is unavailable) + String[] parserSettings = getFulltextParserSettings(); + if (parserSettings[0].equals("none")) { + return null; + } + + // Loop through the specified MARC fields: + Set<String> fields = SolrIndexer.instance().getFieldList(record, fieldSpec); + Iterator<String> fieldsIter = fields.iterator(); + if (fields != null) { + while(fieldsIter.hasNext()) { + // Get the current string to work on (and sanitize spaces): + String current = fieldsIter.next().replaceAll(" ", "%20"); + // Filter by file extension + if (extension == null || current.endsWith(extension)) { + // Load the parser output for each tag into a string + result = result + harvestWithParser(current, parserSettings); + } + } + } + // return string to SolrMarc + return result; + } + + /** + * Extract full-text from the documents referenced in the tags + * + * @param Record record current MARC record + * @param String field spec to search for URLs + * @return String The full-text + */ + public String getFulltext(Record record, String fieldSpec) { + return getFulltext(record, fieldSpec, null); + } + + /** + * Extract full-text from the 
documents referenced in the tags + * + * @param Record record current MARC record + * @return String The full-text + */ + public String getFulltext(Record record) { + return getFulltext(record, "856u", null); + } + + /** + * Clean up XML data generated by Aperture + * + * @param f file to clean + * @return a fixed version of the file + */ + public File sanitizeApertureOutput(File f) throws IOException + { + //clean up the aperture xml output + File tempFile = File.createTempFile("buffer", ".tmp"); + FileOutputStream fw = new FileOutputStream(tempFile); + OutputStreamWriter writer = new OutputStreamWriter(fw, "UTF8"); + + //delete this control character from the File and save + FileReader fr = new FileReader(f); + BufferedReader br = new BufferedReader(fr); + while (br.ready()) { + writer.write(sanitizeFullText(br.readLine())); + } + writer.close(); + br.close(); + fr.close(); + + return tempFile; + } + + /** + * Clean up bad characters in the full text. + * + * @param text text to clean + * @return cleaned text + */ + public String sanitizeFullText(String text) + { + String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; + return text.replaceAll(badChars, " "); + } + + /** + * Harvest the contents of a document file (PDF, Word, etc.) using Aperture. + * This method will only work if Aperture is properly configured in the + * fulltext.ini file. Without proper configuration, this will simply return an + * empty string. + * + * @param url the url extracted from the MARC tag. + * @param aperturePath The path to Aperture + * @return full-text extracted from url + */ + public String harvestWithAperture(String url, String aperturePath) { + String plainText = ""; + // Create temp file. + File f = null; + try { + f = File.createTempFile("apt", ".txt"); + } catch (Throwable e) { + dieWithError("Unable to create temporary file for full text harvest."); + } + + // Delete temp file when program exits. + f.deleteOnExit(); + + // Construct the command to call Aperture + String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url; + + // Call Aperture + //System.out.println("Loading fulltext from " + url + ". Please wait ..."); + try { + Process p = Runtime.getRuntime().exec(cmd); + + // Debugging output + /* + BufferedReader stdInput = new BufferedReader(new + InputStreamReader(p.getInputStream())); + String s; + while ((s = stdInput.readLine()) != null) { + System.out.println(s); + } + */ + + // Wait for Aperture to finish + p.waitFor(); + } catch (Throwable e) { + logger.error("Problem executing Aperture -- " + e.getMessage()); + } + + // Parse Aperture XML output + Document xmlDoc = null; + try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = dbf.newDocumentBuilder(); + File tempFile = sanitizeApertureOutput(f); + xmlDoc = db.parse(tempFile); + NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); + if(nl != null && nl.getLength() > 0) { + Node node = nl.item(0); + if (node.getNodeType() == Node.ELEMENT_NODE) { + plainText = plainText + node.getTextContent(); + } + } + + // we'll hold onto the temp file if it failed to parse for debugging; + // only set it up to be deleted if we've made it this far successfully. + tempFile.deleteOnExit(); + } catch (Throwable e) { + logger.error("Problem parsing Aperture XML -- " + e.getMessage()); + } + + return plainText; + } + + /** + * Harvest the contents of a document file (PDF, Word, etc.) using Tika. 
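+     * The spawned command has the shape (hypothetical install path):
+     * {@code java -jar /usr/local/tika-app.jar -t -eUTF8 <url>}.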
+ * This method will only work if Tika is properly configured in the fulltext.ini + * file. Without proper configuration, this will simply return an empty string. + * + * @param url the url extracted from the MARC tag. + * @param scraperPath path to Tika + * @return the full-text + */ + public String harvestWithTika(String url, String scraperPath) { + + // Construct the command + String cmd = "java -jar " + scraperPath + " -t -eUTF8 " + url; + + StringBuilder stringBuilder= new StringBuilder(); + + // Call our scraper + //System.out.println("Loading fulltext from " + url + ". Please wait ..."); + try { + Process p = Runtime.getRuntime().exec(cmd); + BufferedReader stdInput = new BufferedReader(new + InputStreamReader(p.getInputStream(), "UTF8")); + + // We'll build the string from the command output + String s; + while ((s = stdInput.readLine()) != null) { + stringBuilder.append(s); + } + } catch (Throwable e) { + logger.error("Problem with Tika -- " + e.getMessage()); + } + + return sanitizeFullText(stringBuilder.toString()); + } + + /** + * Harvest the contents of a document file (PDF, Word, etc.) using the active parser. + * + * @param url the URL extracted from the MARC tag. + * @param settings configuration settings from {@code getFulltextParserSettings}. + * @return the full-text + */ + public String harvestWithParser(String url, String[] settings) { + if (settings[0].equals("aperture")) { + return harvestWithAperture(url, settings[1]); + } else if (settings[0].equals("tika")) { + return harvestWithTika(url, settings[1]); + } + return null; + } + + /** + * Log an error message and throw a fatal exception. + * @param msg message to log + */ + private void dieWithError(String msg) + { + logger.error(msg); + throw new SolrMarcIndexerException(SolrMarcIndexerException.EXIT, msg); + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/GeoTools.java b/import/index_java/src/org/vufind/index/GeoTools.java new file mode 100644 index 0000000000000000000000000000000000000000..6fd41eab3b919815dca7bd5c2d35e24e6dae1f13 --- /dev/null +++ b/import/index_java/src/org/vufind/index/GeoTools.java @@ -0,0 +1,300 @@ +package org.vufind.index; +/** + * Geographic indexing routines. + * + * This code is designed to get latitude and longitude coordinates. + * Records can have multiple coordinates sets of points and/or rectangles. + * Points are represented by coordinate sets where N=S E=W. + * + * code adapted from xrosecky - Moravian Library + * https://github.com/moravianlibrary/VuFind-2.x/blob/master/import/index_scripts/geo.bsh + * and incorporates legacy VuFind functionality for GoogleMap display. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import org.marc4j.marc.Record;
+import org.marc4j.marc.VariableField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.Subfield;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Geographic indexing routines.
+ */
+public class GeoTools
+{
+    private static final Pattern COORDINATES_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d{3})(\\d{2})(\\d{2})");
+    private static final Pattern HDMSHDD_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d+(\\.\\d+)?)");
+    private static final Pattern PMDD_PATTERN = Pattern.compile("^([+-])(\\d+(\\.\\d+)?)");
+
+    /**
+     * Get the data from a subfield, or null if the subfield is absent
+     * (034 fields are not guaranteed to carry all of $d/$e/$f/$g).
+     */
+    protected String getSubfieldOrNull(DataField df, char code) {
+        Subfield sf = df.getSubfield(code);
+        return sf == null ? null : sf.getData();
+    }
+
+    /**
+     * Convert MARC coordinates into location_geo format.
+     *
+     * @param Record record
+     * @return List geo_coordinates
+     */
+    public List<String> getAllCoordinates(Record record) {
+        List<String> geo_coordinates = new ArrayList<String>();
+        List<VariableField> list034 = record.getVariableFields("034");
+        if (list034 != null) {
+            for (VariableField vf : list034) {
+                DataField df = (DataField) vf;
+                String d = getSubfieldOrNull(df, 'd');
+                String e = getSubfieldOrNull(df, 'e');
+                String f = getSubfieldOrNull(df, 'f');
+                String g = getSubfieldOrNull(df, 'g');
+                //System.out.println("raw Coords: "+d+" "+e+" "+f+" "+g);
+
+                // Check to see if there are only 2 coordinates
+                // If so, copy them into the corresponding coordinate fields
+                if ((d != null && (e == null || e.trim().equals(""))) && (f != null && (g == null || g.trim().equals("")))) {
+                    e = d;
+                    g = f;
+                }
+                if ((e != null && (d == null || d.trim().equals(""))) && (g != null && (f == null || f.trim().equals("")))) {
+                    d = e;
+                    f = g;
+                }
+
+                // Check and convert coordinates to +/- decimal degrees
+                Double west = convertCoordinate(d);
+                Double east = convertCoordinate(e);
+                Double north = convertCoordinate(f);
+                Double south = convertCoordinate(g);
+
+                // New Format for indexing coordinates in Solr 5.0 - minX, maxX, maxY, minY
+                // Note - storage in Solr follows the WENS order, but display is WSEN order
+                String result = String.format("ENVELOPE(%s,%s,%s,%s)", new Object[] { west, east, north, south });
+
+                if (validateCoordinates(west, east, north, south)) {
+                    geo_coordinates.add(result);
+                }
+            }
+        }
+        return geo_coordinates;
+    }
+
+    /**
+     * Get point coordinates for GoogleMap display.
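+     * A point is an 034 whose rectangle collapses to a single location
+     * (N = S and E = W, or only $d/$f populated); each result is formatted
+     * as "longitude,latitude".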
+     *
+     * @param record MARC record
+     * @return list of point coordinates
+     */
+    public List<String> getPointCoordinates(Record record) {
+        List<String> coordinates = new ArrayList<String>();
+        List<VariableField> list034 = record.getVariableFields("034");
+        if (list034 != null) {
+            for (VariableField vf : list034) {
+                DataField df = (DataField) vf;
+                // Guard against missing subfields, as above:
+                String d = (df.getSubfield('d') != null) ? df.getSubfield('d').getData() : null;
+                String e = (df.getSubfield('e') != null) ? df.getSubfield('e').getData() : null;
+                String f = (df.getSubfield('f') != null) ? df.getSubfield('f').getData() : null;
+                String g = (df.getSubfield('g') != null) ? df.getSubfield('g').getData() : null;
+
+                // Check to see if there are only 2 coordinates
+                if ((d != null && (e == null || e.trim().equals(""))) && (f != null && (g == null || g.trim().equals("")))) {
+                    Double long_val = convertCoordinate(d);
+                    Double lat_val = convertCoordinate(f);
+                    if (long_val != null && lat_val != null) {
+                        coordinates.add(Double.toString(long_val) + ',' + Double.toString(lat_val));
+                    }
+                }
+                if ((e != null && (d == null || d.trim().equals(""))) && (g != null && (f == null || f.trim().equals("")))) {
+                    Double long_val = convertCoordinate(e);
+                    Double lat_val = convertCoordinate(g);
+                    if (long_val != null && lat_val != null) {
+                        coordinates.add(Double.toString(long_val) + ',' + Double.toString(lat_val));
+                    }
+                }
+                // Check if N=S and E=W (the null checks prevent an exception
+                // when a subfield is missing):
+                if (d != null && d.equals(e) && f != null && f.equals(g)) {
+                    Double long_val = convertCoordinate(d);
+                    Double lat_val = convertCoordinate(f);
+                    if (long_val != null && lat_val != null) {
+                        coordinates.add(Double.toString(long_val) + ',' + Double.toString(lat_val));
+                    }
+                }
+            }
+        }
+        return coordinates;
+    }
+
+    /**
+     * Get all available coordinates from the record for display.
+     *
+     * @param record MARC record
+     * @return list of raw display coordinates
+     */
+    public List<String> getDisplayCoordinates(Record record) {
+        List<String> geo_coordinates = new ArrayList<String>();
+        List<VariableField> list034 = record.getVariableFields("034");
+        if (list034 != null) {
+            for (VariableField vf : list034) {
+                DataField df = (DataField) vf;
+                String west = (df.getSubfield('d') != null) ? df.getSubfield('d').getData() : null;
+                String east = (df.getSubfield('e') != null) ? df.getSubfield('e').getData() : null;
+                String north = (df.getSubfield('f') != null) ? df.getSubfield('f').getData() : null;
+                String south = (df.getSubfield('g') != null) ? df.getSubfield('g').getData() : null;
+                // Only display complete coordinate sets:
+                if (west != null && east != null && north != null && south != null) {
+                    geo_coordinates.add(String.format("%s %s %s %s", new Object[] { west, east, north, south }));
+                }
+            }
+        }
+        return geo_coordinates;
+    }
+
+    /**
+     * Convert a coordinate string in HDMS, HDD or +/-DD format to decimal degrees.
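+     *
+     * For example (hypothetical values): "E0793235" (HDMS) converts to roughly
+     * 79.543, "W079.5" (HDD) converts to -79.5, and "-79.5" (+/-DD) is
+     * returned as -79.5.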
+     *
+     * @param coordinateStr coordinate string from the 034 field
+     * @return coordinate in decimal degrees, or null if it cannot be parsed
+     */
+    protected Double convertCoordinate(String coordinateStr) {
+        if (coordinateStr == null) {
+            return null;
+        }
+        Double coordinate = Double.NaN;
+        Matcher HDmatcher = HDMSHDD_PATTERN.matcher(coordinateStr);
+        Matcher PMDmatcher = PMDD_PATTERN.matcher(coordinateStr);
+        if (HDmatcher.matches()) {
+            String hemisphere = HDmatcher.group(1).toUpperCase();
+            Double degrees = Double.parseDouble(HDmatcher.group(2));
+            // Check for HDD or HDMS
+            if (hemisphere.equals("N") || hemisphere.equals("S")) {
+                if (degrees > 90) {
+                    // Pad short (hddmmss) latitudes out to the hdddmmss form
+                    // expected by COORDINATES_PATTERN:
+                    String digits = HDmatcher.group(2);
+                    String hdmsCoordinate = hemisphere + (digits.length() < 7 ? "0" + digits : digits);
+                    coordinate = coordinateToDecimal(hdmsCoordinate);
+                } else {
+                    coordinate = Double.parseDouble(HDmatcher.group(2));
+                    if (hemisphere.equals("S")) {
+                        coordinate *= -1;
+                    }
+                }
+            }
+            if (hemisphere.equals("E") || hemisphere.equals("W")) {
+                if (degrees > 180) {
+                    String hdmsCoordinate = HDmatcher.group(0);
+                    coordinate = coordinateToDecimal(hdmsCoordinate);
+                } else {
+                    coordinate = Double.parseDouble(HDmatcher.group(2));
+                    if (hemisphere.equals("W")) {
+                        coordinate *= -1;
+                    }
+                }
+            }
+            return coordinate;
+        } else if (PMDmatcher.matches()) {
+            String hemisphere = PMDmatcher.group(1);
+            coordinate = Double.parseDouble(PMDmatcher.group(2));
+            if (hemisphere.equals("-")) {
+                coordinate *= -1;
+            }
+            return coordinate;
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Convert HDMS coordinates to decimal degrees.
+     *
+     * @param coordinateStr coordinate string in [EWNS]dddmmss form
+     * @return coordinate in decimal degrees, or null if it cannot be parsed
+     */
+    protected Double coordinateToDecimal(String coordinateStr) {
+        Matcher matcher = COORDINATES_PATTERN.matcher(coordinateStr);
+        if (matcher.matches()) {
+            String hemisphere = matcher.group(1).toUpperCase();
+            int degrees = Integer.parseInt(matcher.group(2));
+            int minutes = Integer.parseInt(matcher.group(3));
+            int seconds = Integer.parseInt(matcher.group(4));
+            double coordinate = degrees + (minutes / 60.0) + (seconds / 3600.0);
+            if (hemisphere.equals("W") || hemisphere.equals("S")) {
+                coordinate *= -1;
+            }
+            return coordinate;
+        }
+        return null;
+    }
+
+    /**
+     * Check decimal degree coordinates to make sure they are valid.
+     *
+     * @param west western longitude in decimal degrees
+     * @param east eastern longitude in decimal degrees
+     * @param north northern latitude in decimal degrees
+     * @param south southern latitude in decimal degrees
+     * @return true if the four values form a valid bounding box
+     */
+    protected boolean validateCoordinates(Double west, Double east, Double north, Double south) {
+        if (west == null || east == null || north == null || south == null) {
+            return false;
+        }
+        if (west.isNaN() || east.isNaN() || north.isNaN() || south.isNaN()) {
+            return false;
+        }
+        if (west > 180.0 || west < -180.0 || east > 180.0 || east < -180.0) {
+            return false;
+        }
+        if (north > 90.0 || north < -90.0 || south > 90.0 || south < -90.0) {
+            return false;
+        }
+        if (north < south || west > east) {
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * THIS FUNCTION HAS BEEN DEPRECATED.
+     * Determine the longitude and latitude of the item's location.
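+     * Unlike the conversion routines above, this performs no conversion: it
+     * only accepts 034 values that are already decimal numbers, and joins the
+     * raw $d and $f values with a comma.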
+     *
+     * @param record current MARC record
+     * @return string of form "longitude, latitude"
+     * @deprecated
+     */
+    public String getLongLat(Record record) {
+        // Check 034 subfield d and f
+        List<VariableField> fields = record.getVariableFields("034");
+        if (fields != null) {
+            Iterator<VariableField> fieldsIter = fields.iterator();
+            DataField physical;
+            while (fieldsIter.hasNext()) {
+                physical = (DataField) fieldsIter.next();
+                String val = null;
+
+                List<Subfield> subfields_d = physical.getSubfields('d');
+                if (subfields_d != null) {
+                    Iterator<Subfield> subfieldsIter_d = subfields_d.iterator();
+                    while (subfieldsIter_d.hasNext()) {
+                        val = subfieldsIter_d.next().getData().trim();
+                        if (!val.matches("-?\\d+(\\.\\d+)?")) {
+                            return null;
+                        }
+                    }
+                }
+                List<Subfield> subfields_f = physical.getSubfields('f');
+                if (subfields_f != null) {
+                    Iterator<Subfield> subfieldsIter_f = subfields_f.iterator();
+                    while (subfieldsIter_f.hasNext()) {
+                        String val2 = subfieldsIter_f.next().getData().trim();
+                        if (!val2.matches("-?\\d+(\\.\\d+)?")) {
+                            return null;
+                        }
+                        val = val + ',' + val2;
+                    }
+                }
+                return val;
+            }
+        }
+        //otherwise return null
+        return null;
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/IllustrationTools.java b/import/index_java/src/org/vufind/index/IllustrationTools.java
new file mode 100644
index 0000000000000000000000000000000000000000..36578eff7df8653ad77581a3147a747932830f97
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/IllustrationTools.java
@@ -0,0 +1,104 @@
+package org.vufind.index;
+/**
+ * Illustration indexing routines.
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import java.util.Iterator;
+import org.marc4j.marc.Record;
+import org.marc4j.marc.VariableField;
+import org.marc4j.marc.ControlField;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.Subfield;
+import java.util.List;
+
+/**
+ * Illustration indexing routines.
+ */
+public class IllustrationTools
+{
+    /**
+     * Determine if a record is illustrated.
+     *
+     * @param record current MARC record
+     * @return "Illustrated" or "Not Illustrated"
+     */
+    public String isIllustrated(Record record) {
+        String leader = record.getLeader().toString();
+
+        // Does the leader indicate this is a "language material" that might have extra
+        // illustration details in the fixed fields?
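+        // (Leader position 06 = 'a' means "language material"; for such records
+        // the 008/18-21 and 006/01-04 bytes may carry illustration codes.)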
+        if (leader.charAt(6) == 'a') {
+            String currentCode = "";    // for use in loops below
+
+            // List of 008/18-21 codes that indicate illustrations:
+            String illusCodes = "abcdefghijklmop";
+
+            // Check the illustration characters of the 008:
+            ControlField fixedField = (ControlField) record.getVariableField("008");
+            if (fixedField != null) {
+                String fixedFieldText = fixedField.getData().toLowerCase();
+                for (int i = 18; i <= 21; i++) {
+                    if (i < fixedFieldText.length()) {
+                        currentCode = fixedFieldText.substring(i, i + 1);
+                        if (illusCodes.contains(currentCode)) {
+                            return "Illustrated";
+                        }
+                    }
+                }
+            }
+
+            // Now check if any 006 fields apply:
+            List<VariableField> fields = record.getVariableFields("006");
+            if (fields != null) {
+                Iterator<VariableField> fieldsIter = fields.iterator();
+                while (fieldsIter.hasNext()) {
+                    fixedField = (ControlField) fieldsIter.next();
+                    String fixedFieldText = fixedField.getData().toLowerCase();
+                    for (int i = 1; i <= 4; i++) {
+                        if (i < fixedFieldText.length()) {
+                            currentCode = fixedFieldText.substring(i, i + 1);
+                            if (illusCodes.contains(currentCode)) {
+                                return "Illustrated";
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Now check for interesting strings in 300 subfield b:
+        List<VariableField> fields = record.getVariableFields("300");
+        if (fields != null) {
+            Iterator<VariableField> fieldsIter = fields.iterator();
+            DataField physical;
+            while (fieldsIter.hasNext()) {
+                physical = (DataField) fieldsIter.next();
+                List<Subfield> subfields = physical.getSubfields('b');
+                for (Subfield sf : subfields) {
+                    String desc = sf.getData().toLowerCase();
+                    if (desc.contains("ill.") || desc.contains("illus.")) {
+                        return "Illustrated";
+                    }
+                }
+            }
+        }
+
+        // If we made it this far, we found no sign of illustrations:
+        return "Not Illustrated";
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/LccnTools.java b/import/index_java/src/org/vufind/index/LccnTools.java
new file mode 100644
index 0000000000000000000000000000000000000000..65536146eca97e2d4b1ee2191e8f1e4eb7ec3f9b
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/LccnTools.java
@@ -0,0 +1,157 @@
+package org.vufind.index;
+/**
+ * LCCN indexing routines.
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import org.solrmarc.index.SolrIndexer;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * LCCN indexing routines.
+ */
+public class LccnTools
+{
+    /**
+     * Normalize a single LCCN using the procedure specified at:
+     * http://www.loc.gov/marc/lccn-namespace.html#normalization
+     * @param lccn raw LCCN from the MARC record
+     * @return normalized LCCN
+     */
+    public String getNormalizedLCCN(String lccn) {
+        // Remove whitespace:
+        lccn = lccn.replaceAll(" ", "");
+
+        // Chop off anything following a forward slash:
+        String[] parts = lccn.split("/", 2);
+        lccn = parts[0];
+
+        // Pad any digits following a hyphen out to at least six digits:
+        parts = lccn.split("-", 2);
+        if (parts.length > 1) {
+            String secondPart = parts[1];
+            while (secondPart.length() < 6) {
+                secondPart = "0" + secondPart;
+            }
+            lccn = parts[0] + secondPart;
+        }
+
+        // Send back normalized LCCN:
+        return lccn;
+    }
+
+    /**
+     * Extract LCCNs from a record and return them in a normalized format
+     * @param record
+     * @param fieldSpec
+     * @return Set of normalized LCCNs
+     */
+    public Set<String> getNormalizedLCCNs(Record record, String fieldSpec) {
+        // Initialize return value:
+        Set<String> result = new LinkedHashSet<String>();
+
+        // Loop through relevant fields and normalize everything:
+        Set<String> lccns = SolrIndexer.instance().getFieldList(record, fieldSpec);
+        if (lccns != null) {
+            Iterator<String> lccnIter = lccns.iterator();
+            String current;
+            while (lccnIter.hasNext()) {
+                current = getNormalizedLCCN(lccnIter.next());
+                if (current != null && current.length() > 0) {
+                    result.add(current);
+                }
+            }
+        }
+
+        // Send back results:
+        return result;
+    }
+
+    /**
+     * Extract LCCNs from a record and return them in a normalized format
+     * @param record
+     * @return Set of normalized LCCNs
+     */
+    public Set<String> getNormalizedLCCNs(Record record) {
+        // Send in a default fieldSpec if none was provided by the user:
+        return getNormalizedLCCNs(record, "010a");
+    }
+
+    /**
+     * Extract the first valid LCCN from a record and return it in a normalized format
+     * with an optional prefix added (helpful for guaranteeing unique IDs)
+     * @param indexer
+     * @param record
+     * @param fieldSpec
+     * @param prefix
+     * @return Normalized LCCN
+     */
+    public String getFirstNormalizedLCCN(SolrIndexer indexer,
+        Record record, String fieldSpec, String prefix) {
+        // Loop through relevant fields in search of first valid LCCN
+        // (using the indexer passed in by the caller):
+        Set<String> lccns = indexer.getFieldList(record, fieldSpec);
+        if (lccns != null) {
+            Iterator<String> lccnIter = lccns.iterator();
+            String current;
+            while (lccnIter.hasNext()) {
+                current = getNormalizedLCCN(lccnIter.next());
+                if (current != null && current.length() > 0) {
+                    return prefix + current;
+                }
+            }
+        }
+
+        // If we got this far, we couldn't find a valid value:
+        return null;
+    }
+
+    /**
+     * Extract the first valid LCCN from a record and return it in a normalized format
+     * with an optional prefix added (helpful for guaranteeing unique IDs)
+     * @param record
+     * @param fieldSpec
+     * @param prefix
+     * @return Normalized LCCN
+     */
+    public String getFirstNormalizedLCCN(Record record, String fieldSpec, String prefix) {
+        return getFirstNormalizedLCCN(SolrIndexer.instance(), record, fieldSpec, prefix);
+    }
+
+    /**
+     * Extract the first valid LCCN from a record and return it in a normalized format
+     * @param record
+     * @param fieldSpec
+     * @return Normalized LCCN
+     */
+    public String getFirstNormalizedLCCN(Record record, String fieldSpec) {
+        // Send in a default prefix if none was provided by the user:
+        return getFirstNormalizedLCCN(SolrIndexer.instance(), record, fieldSpec, "");
+    }
+
+    /**
+     * Extract the first valid LCCN from a record and return it in a normalized
format + * @param record + * @return Normalized LCCN + */ + public String getFirstNormalizedLCCN(Record record) { + // Send in a default fieldSpec/prefix if none were provided by the user: + return getFirstNormalizedLCCN(SolrIndexer.instance(), record, "010a", ""); + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/PublisherTools.java b/import/index_java/src/org/vufind/index/PublisherTools.java new file mode 100644 index 0000000000000000000000000000000000000000..2fe8f2e3c69c4db9fd4f1d08965631c04355ccd2 --- /dev/null +++ b/import/index_java/src/org/vufind/index/PublisherTools.java @@ -0,0 +1,91 @@ +package org.vufind.index; +/** + * Publisher indexing routines. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import org.marc4j.marc.Record; +import org.marc4j.marc.VariableField; +import org.marc4j.marc.DataField; +import org.marc4j.marc.Subfield; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +/** + * Publisher indexing routines. + */ +public class PublisherTools +{ + /** + * Get all available publishers from the record. + * + * @param record MARC record + * @return set of publishers + */ + public Set<String> getPublishers(final Record record) { + Set<String> publishers = new LinkedHashSet<String>(); + + // First check old-style 260b name: + List<VariableField> list260 = record.getVariableFields("260"); + for (VariableField vf : list260) + { + DataField df = (DataField) vf; + String currentString = ""; + for (Subfield current : df.getSubfields('b')) { + currentString = currentString.trim().concat(" " + current.getData()).trim(); + } + if (currentString.length() > 0) { + publishers.add(currentString); + } + } + + // Now track down relevant RDA-style 264b names; we only care about + // copyright and publication names (and ignore copyright names if + // publication names are present). 
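+        // (The 264 second indicator gives the function of the entity: 1 =
+        // publisher, 4 = copyright notice date; other functions -- production,
+        // distribution, manufacture -- are deliberately skipped here.)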
+        Set<String> pubNames = new LinkedHashSet<String>();
+        Set<String> copyNames = new LinkedHashSet<String>();
+        List<VariableField> list264 = record.getVariableFields("264");
+        for (VariableField vf : list264)
+        {
+            DataField df = (DataField) vf;
+            String currentString = "";
+            for (Subfield current : df.getSubfields('b')) {
+                currentString = currentString.trim().concat(" " + current.getData()).trim();
+            }
+            if (currentString.length() > 0) {
+                char ind2 = df.getIndicator2();
+                switch (ind2)
+                {
+                    case '1':
+                        pubNames.add(currentString);
+                        break;
+                    case '4':
+                        copyNames.add(currentString);
+                        break;
+                }
+            }
+        }
+        if (pubNames.size() > 0) {
+            publishers.addAll(pubNames);
+        } else if (copyNames.size() > 0) {
+            publishers.addAll(copyNames);
+        }
+
+        return publishers;
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/PunctuationTools.java b/import/index_java/src/org/vufind/index/PunctuationTools.java
new file mode 100644
index 0000000000000000000000000000000000000000..e71f2b62490913d5b8af3d0a70e2862785e289d0
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/PunctuationTools.java
@@ -0,0 +1,57 @@
+package org.vufind.index;
+/**
+ * Punctuation indexing routines.
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import java.util.LinkedHashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+import org.solrmarc.index.SolrIndexer;
+
+/**
+ * Punctuation indexing routines.
+ */
+public class PunctuationTools
+{
+    /**
+     * Normalize trailing punctuation. This mimics the functionality built into VuFind's
+     * textFacet field type, so that you can get equivalent values when indexing into
+     * a string field. (Useful for docValues support.)
+     *
+     * Can return null
+     *
+     * @param record current MARC record
+     * @param fieldSpec which MARC fields / subfields need to be analyzed
+     * @return Set containing normalized values
+     */
+    public Set<String> normalizeTrailingPunctuation(Record record, String fieldSpec) {
+        // Initialize our return value:
+        Set<String> result = new LinkedHashSet<String>();
+
+        // Loop through the specified MARC fields:
+        Set<String> input = SolrIndexer.instance().getFieldList(record, fieldSpec);
+        // Strip trailing periods/whitespace unless preceded by a single capital
+        // letter (an initial). Note the double backslash: "\\b" is the regex
+        // word boundary, while a bare "\b" would be a literal backspace character.
+        Pattern pattern = Pattern.compile("(?<!\\b[A-Z])[.\\s]*$");
+        for (String current : input) {
+            result.add(pattern.matcher(current).replaceAll(""));
+        }
+
+        // If we found no matches, return null; otherwise, return our results:
+        return result.isEmpty() ? null : result;
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/ReadingProgramTools.java b/import/index_java/src/org/vufind/index/ReadingProgramTools.java
new file mode 100644
index 0000000000000000000000000000000000000000..e342bab130082bded56a9e853b6c60cbcea967f6
--- /dev/null
+++ b/import/index_java/src/org/vufind/index/ReadingProgramTools.java
@@ -0,0 +1,90 @@
+package org.vufind.index;
+/**
+ * Reading program logic courtesy of Chanel Wheeler
+ *
+ * Example usage:
+ *
+ * #### In marc_local.properties, insert this:
+ * arLevel = custom, getARLevel, (pattern_map.level)
+ * rcLevel = custom, getRCLevel, (pattern_map.level)
+ * pattern_map.level.pattern_0 = ([0-9]\\.[0-9]).*=>$1
+ *
+ * #### In solr/vufind/biblio/conf/schema.xml (I'm not aware of any way to localize this),
+ * #### add this in the <types> section:
+ * <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
+ *
+ * #### In solr/vufind/biblio/conf/schema.xml, add this in the <fields> section
+ * <field name="arLevel" type="tfloat" indexed="true" stored="true"/>
+ * <field name="rcLevel" type="tfloat" indexed="true" stored="true"/>
+ *
+ * Copyright (C) Villanova University 2017.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+import org.marc4j.marc.Record;
+import org.marc4j.marc.DataField;
+import org.marc4j.marc.VariableField;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Reading program logic courtesy of Chanel Wheeler
+ */
+public class ReadingProgramTools
+{
+    /**
+     * Get reading level for Accelerated Reader items
+     *
+     * @param record
+     * @return AR level
+     */
+    public String getARLevel(Record record) {
+        List<VariableField> readingprograms = record.getVariableFields("526");
+        if (readingprograms != null) {
+            Iterator<VariableField> rpIter = readingprograms.iterator();
+            while (rpIter.hasNext()) {
+                DataField rp = (DataField) rpIter.next();
+                if (rp.getSubfield('a') != null) {
+                    if (rp.getSubfield('a').getData().toLowerCase().contains("accelerated reader")) {
+                        // Subfield c holds the level; guard against records that
+                        // name the program but omit the level:
+                        return (rp.getSubfield('c') != null) ? rp.getSubfield('c').getData() : null;
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Get reading level for Reading Counts items
+     *
+     * @param record
+     * @return RC level
+     */
+    public String getRCLevel(Record record) {
+        List<VariableField> readingprograms = record.getVariableFields("526");
+        if (readingprograms != null) {
+            Iterator<VariableField> rpIter = readingprograms.iterator();
+            while (rpIter.hasNext()) {
+                DataField rp = (DataField) rpIter.next();
+                if (rp.getSubfield('a') != null) {
+                    if (rp.getSubfield('a').getData().toLowerCase().contains("reading counts")) {
+                        return (rp.getSubfield('c') != null) ? rp.getSubfield('c').getData() : null;
+                    }
+                }
+            }
+        }
+        return null;
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/vufind/index/RelatorContainer.java b/import/index_java/src/org/vufind/index/RelatorContainer.java
new file mode 100644
index
0000000000000000000000000000000000000000..5d2177e80a35519e466162533de6a767b12921e3 --- /dev/null +++ b/import/index_java/src/org/vufind/index/RelatorContainer.java @@ -0,0 +1,57 @@ +package org.vufind.index; +/** + * Singleton for storing relator information. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Singleton for storing relator information. + */ +public class RelatorContainer +{ + private static ThreadLocal<RelatorContainer> containerCache = + new ThreadLocal<RelatorContainer>() + { + @Override + protected RelatorContainer initialValue() + { + return new RelatorContainer(); + } + }; + + private ConcurrentHashMap<String, String> relatorSynonymLookup = new ConcurrentHashMap<String, String>(); + private Set<String> knownRelators = new LinkedHashSet<String>(); + + public ConcurrentHashMap<String, String> getSynonymLookup() + { + return relatorSynonymLookup; + } + + public Set<String> getKnownRelators() + { + return knownRelators; + } + + public static RelatorContainer instance() + { + return containerCache.get(); + } +} \ No newline at end of file diff --git a/import/index_java/src/org/vufind/index/UpdateDateTools.java b/import/index_java/src/org/vufind/index/UpdateDateTools.java new file mode 100644 index 0000000000000000000000000000000000000000..e09517266d1ff734f222d6d19801c0673d0107bf --- /dev/null +++ b/import/index_java/src/org/vufind/index/UpdateDateTools.java @@ -0,0 +1,223 @@ +package org.vufind.index; +/** + * Indexing routines using the UpdateDateTracker. + * + * Copyright (C) Villanova University 2017. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +import java.util.Iterator; +import java.util.Set; +import java.text.SimpleDateFormat; +import org.solrmarc.index.SolrIndexer; +import org.solrmarc.tools.SolrMarcIndexerException; +import org.marc4j.marc.Record; +import org.apache.log4j.Logger; + +/** + * Indexing routines using the UpdateDateTracker. 
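+ *
+ * These routines derive a "latest transaction" date from the 005 (or, failing
+ * that, the 008) control field and record first/last indexed times through a
+ * thread-local UpdateDateTracker instance.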
+ */
+public class UpdateDateTools
+{
+    // Initialize logging category
+    static Logger logger = Logger.getLogger(UpdateDateTools.class.getName());
+
+    // The SimpleDateFormat class is not thread-safe, so the formatters below
+    // were changed to be non-static; given the rest of SolrMarc's design,
+    // this makes them work correctly.
+    private SimpleDateFormat marc005date = new SimpleDateFormat("yyyyMMddHHmmss.S");
+    private SimpleDateFormat marc008date = new SimpleDateFormat("yyMMdd");
+
+    /**
+     * Support method for getLatestTransaction.
+     * @return Date extracted from 005 (or very old date, if unavailable)
+     */
+    private java.util.Date normalize005Date(String input)
+    {
+        // Normalize "null" strings to a generic bad value:
+        if (input == null) {
+            input = "null";
+        }
+
+        // Try to parse the date; default to "millisecond 0" (very old date) if we can't
+        // parse the data successfully.
+        java.util.Date retVal;
+        try {
+            retVal = marc005date.parse(input);
+        } catch (java.text.ParseException e) {
+            retVal = new java.util.Date(0);
+        }
+        return retVal;
+    }
+
+    /**
+     * Support method for getLatestTransaction.
+     * @return Date extracted from 008 (or very old date, if unavailable)
+     */
+    private java.util.Date normalize008Date(String input)
+    {
+        // Normalize "null" strings to a generic bad value:
+        if (input == null || input.length() < 6) {
+            input = "null";
+        }
+
+        // Try to parse the date; default to "millisecond 0" (very old date) if we can't
+        // parse the data successfully.
+        java.util.Date retVal;
+        try {
+            retVal = marc008date.parse(input.substring(0, 6));
+        } catch (java.lang.StringIndexOutOfBoundsException e) {
+            retVal = new java.util.Date(0);
+        } catch (java.text.ParseException e) {
+            retVal = new java.util.Date(0);
+        }
+        return retVal;
+    }
+
+    /**
+     * Extract the latest transaction date from the MARC record. This is useful
+     * for detecting when a record has changed since the last time it was indexed.
+     *
+     * @param record MARC record
+     * @return Latest transaction date.
+     */
+    public java.util.Date getLatestTransaction(Record record) {
+        // First try the 005 -- this is most likely to have a precise transaction date:
+        Set<String> dates = SolrIndexer.instance().getFieldList(record, "005");
+        if (dates != null) {
+            Iterator<String> dateIter = dates.iterator();
+            if (dateIter.hasNext()) {
+                return normalize005Date(dateIter.next());
+            }
+        }
+
+        // No luck with 005? Try 008 next -- less precise, but better than nothing:
+        dates = SolrIndexer.instance().getFieldList(record, "008");
+        if (dates != null) {
+            Iterator<String> dateIter = dates.iterator();
+            if (dateIter.hasNext()) {
+                return normalize008Date(dateIter.next());
+            }
+        }
+
+        // If we got this far, we couldn't find a valid value; return an arbitrary date:
+        return new java.util.Date(0);
+    }
+
+
+    /**
+     * Update the index date in the database for the specified core/ID pair. We
+     * maintain a database of "first/last indexed" times separately from Solr to
+     * allow the history of our indexing activity to be stored permanently in a
+     * fashion that can survive even a total Solr rebuild.
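+     *
+     * @param core Solr core name
+     * @param id record ID
+     * @param latestTransaction latest transaction date extracted from the record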
+     */
+    public void updateTracker(String core, String id, java.util.Date latestTransaction)
+    {
+        // Update the database (if necessary):
+        try {
+            UpdateDateTracker.instance().index(core, id, latestTransaction);
+        } catch (java.sql.SQLException e) {
+            // If we're in the process of shutting down, an error is expected:
+            if (!DatabaseManager.instance().isShuttingDown()) {
+                dieWithError("Unexpected database error");
+            }
+        }
+    }
+
+    /**
+     * Get the "first indexed" date for the current record. (This is the first
+     * time that SolrMarc ever encountered this particular record).
+     *
+     * @param record current MARC record
+     * @param fieldSpec fields / subfields to be analyzed
+     * @param core core name
+     * @return first indexed date string
+     */
+    public String getFirstIndexed(Record record, String fieldSpec, String core) {
+        // Update the database, then send back the first indexed date:
+        updateTracker(core, SolrIndexer.instance().getFirstFieldVal(record, fieldSpec), getLatestTransaction(record));
+        return UpdateDateTracker.instance().getFirstIndexed();
+    }
+
+    /**
+     * Get the "first indexed" date for the current record. (This is the first
+     * time that SolrMarc ever encountered this particular record).
+     *
+     * @param record current MARC record
+     * @param fieldSpec fields / subfields to be analyzed
+     * @return first indexed date string
+     */
+    public String getFirstIndexed(Record record, String fieldSpec) {
+        return getFirstIndexed(record, fieldSpec, "biblio");
+    }
+
+    /**
+     * Get the "first indexed" date for the current record. (This is the first
+     * time that SolrMarc ever encountered this particular record).
+     *
+     * @param record current MARC record
+     * @return first indexed date string
+     */
+    public String getFirstIndexed(Record record) {
+        return getFirstIndexed(record, "001", "biblio");
+    }
+
+    /**
+     * Get the "last indexed" date for the current record. (This is the last time
+     * the record changed from SolrMarc's perspective).
+     *
+     * @param record current MARC record
+     * @param fieldSpec fields / subfields to be analyzed
+     * @param core core name
+     * @return last indexed date string
+     */
+    public String getLastIndexed(Record record, String fieldSpec, String core) {
+        // Update the database, then send back the last indexed date:
+        updateTracker(core, SolrIndexer.instance().getFirstFieldVal(record, fieldSpec), getLatestTransaction(record));
+        return UpdateDateTracker.instance().getLastIndexed();
+    }
+
+    /**
+     * Get the "last indexed" date for the current record. (This is the last time
+     * the record changed from SolrMarc's perspective).
+     *
+     * @param record current MARC record
+     * @param fieldSpec fields / subfields to analyze
+     * @return last indexed date string
+     */
+    public String getLastIndexed(Record record, String fieldSpec) {
+        return getLastIndexed(record, fieldSpec, "biblio");
+    }
+
+    /**
+     * Get the "last indexed" date for the current record. (This is the last time
+     * the record changed from SolrMarc's perspective).
+     *
+     * @param record current MARC record
+     * @return last indexed date string
+     */
+    public String getLastIndexed(Record record) {
+        return getLastIndexed(record, "001", "biblio");
+    }
+
+    /**
+     * Log an error message and throw a fatal exception.
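+     * This aborts the entire indexing run by throwing a fatal
+     * SolrMarcIndexerException rather than just skipping the record.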
+     * @param msg message to log
+     */
+    private void dieWithError(String msg)
+    {
+        logger.error(msg);
+        throw new SolrMarcIndexerException(SolrMarcIndexerException.EXIT, msg);
+    }
+}
\ No newline at end of file
diff --git a/import/index_java/src/org/solrmarc/index/UpdateDateTracker.java b/import/index_java/src/org/vufind/index/UpdateDateTracker.java
similarity index 91%
rename from import/index_java/src/org/solrmarc/index/UpdateDateTracker.java
rename to import/index_java/src/org/vufind/index/UpdateDateTracker.java
index 5ceb0b1fbb5724403a4aa02bec0258ac00678daa..c43aa7a4077fb946e55b5a3d45bd36b76e310bb7 100644
--- a/import/index_java/src/org/solrmarc/index/UpdateDateTracker.java
+++ b/import/index_java/src/org/vufind/index/UpdateDateTracker.java
@@ -1,4 +1,4 @@
-package org.solrmarc.index;
+package org.vufind.index;
 /**
  * Class for managing record update dates.
  *
@@ -21,6 +21,9 @@ package org.solrmarc.index;
 import java.sql.*;
 import java.text.SimpleDateFormat;
 
+/**
+ * Class for managing record update dates.
+ */
 public class UpdateDateTracker
 {
     private Connection db;
@@ -37,6 +40,25 @@ public class UpdateDateTracker
     PreparedStatement selectSql;
     PreparedStatement updateSql;
 
+    private static ThreadLocal<UpdateDateTracker> trackerCache =
+        new ThreadLocal<UpdateDateTracker>()
+        {
+            @Override
+            protected UpdateDateTracker initialValue()
+            {
+                try {
+                    return new UpdateDateTracker(DatabaseManager.instance().getConnection());
+                } catch (SQLException e) {
+                    throw new RuntimeException(e.getMessage());
+                }
+            }
+        };
+
+    public static UpdateDateTracker instance()
+    {
+        return trackerCache.get();
+    }
+
     /* Private support method: create a row in the change_tracker table. */
     private void createRow(Timestamp newRecordChange)
         throws SQLException
diff --git a/import/index_scripts/README_SCRIPTS b/import/index_scripts/README_SCRIPTS
index ee6a073bef2fd04e71afdf40611e3bdd840422f4..69ad62c6d21c5964a59c60c466afd69f0e5d206d 100644
--- a/import/index_scripts/README_SCRIPTS
+++ b/import/index_scripts/README_SCRIPTS
@@ -1 +1,3 @@
-This is the directory in which you should place java-like beanshell scripts for handling custom indexing functions via dynamic scripts.
\ No newline at end of file
+This is the directory in which you should place Java-like BeanShell scripts for handling custom indexing functions via dynamic scripts.
+
+Note that use of BeanShell is discouraged now that Java can be dynamically compiled; see the index_java directory instead.
\ No newline at end of file
diff --git a/import/index_scripts/author.bsh b/import/index_scripts/author.bsh
deleted file mode 100644
index 4bbbc395959b8098fada43068f0f57f05ae01739..0000000000000000000000000000000000000000
--- a/import/index_scripts/author.bsh
+++ /dev/null
@@ -1,692 +0,0 @@
-import org.marc4j.marc.Record;
-import org.marc4j.marc.DataField;
-import org.solrmarc.index.UpdateDateTracker;
-import org.ini4j.Ini;
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.Map;
-
-// define the base level indexer so that its methods can be called from the script.
-// note that the SolrIndexer code will set this value before the script methods are called.
-org.solrmarc.index.SolrIndexer indexer = null;
-
-HashMap relatorSynonymLookup = new HashMap();
-Set knownRelators = new LinkedHashSet();
-
-/**
- * Extract all valid relator terms from a list of subfields using a whitelist.
- * @param subfields List of subfields to check - * @param permittedRoles Whitelist to check against - * @param indexRawRelators Should we index relators raw, as found - * in the MARC (true) or index mapped versions (false)? - * @return Set of valid relator terms - */ -public Set getValidRelatorsFromSubfields(List subfields, List permittedRoles, Boolean indexRawRelators) -{ - Set relators = new LinkedHashSet(); - for (int j = 0; j < subfields.size(); j++) { - String raw = subfields.get(j).getData(); - String current = normalizeRelatorString(raw); - if (permittedRoles.contains(current)) { - relators.add(indexRawRelators ? raw : mapRelatorStringToCode(current)); - } - } - return relators; -} - -/** - * Is this relator term unknown to author-classification.ini? - * @param current relator to check - * @return True if unknown - */ -public Boolean isUnknownRelator(String current) -{ - // If we haven't loaded known relators yet, do so now: - if (knownRelators.size() == 0) { - Map all = indexer.getConfigSection("author-classification.ini", "RelatorSynonyms"); - for (String key : all.keySet()) { - knownRelators.add(normalizeRelatorString(key)); - for (String synonym: all.get(key).split("\\|")) { - knownRelators.add(normalizeRelatorString(synonym)); - } - } - } - return !knownRelators.contains(normalizeRelatorString(current)); -} - -/** - * Extract all valid relator terms from a list of subfields using a whitelist. - * @param subfields List of subfields to check - * @return Set of valid relator terms - */ -public Set getUnknownRelatorsFromSubfields(List subfields) -{ - Set relators = new LinkedHashSet(); - for (int j = 0; j < subfields.size(); j++) { - String current = subfields.get(j).getData().trim(); - if (current.length() > 0 && isUnknownRelator(current)) { - indexer.getLogger().info("Unknown relator: " + current); - relators.add(current); - } - } - return relators; -} - -/** - * Extract all values that meet the specified relator requirements. - * @param authorField Field to analyze - * @param noRelatorAllowed Array of tag names which are allowed to be used with - * no declared relator. - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param unknownRelatorAllowed Array of tag names whose relators should be indexed - * even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return Set - */ -public Set getValidRelators(DataField authorField, - String[] noRelatorAllowed, String relatorConfig, - String[] unknownRelatorAllowed, String indexRawRelators -) { - // get tag number from Field - String tag = authorField.getTag(); - List subfieldE = authorField.getSubfields('e'); - List subfield4 = authorField.getSubfields('4'); - - Set relators = new LinkedHashSet(); - - // if no relator is found, check to see if the current tag is in the "no - // relator allowed" list. 
- if (subfieldE.size() == 0 && subfield4.size() == 0) { - if (Arrays.asList(noRelatorAllowed).contains(tag)) { - relators.add(""); - } - } else { - // If we got this far, we need to figure out what type of relation they have - List permittedRoles = normalizeRelatorStringList(Arrays.asList(loadRelatorConfig(relatorConfig))); - relators.addAll(getValidRelatorsFromSubfields(subfieldE, permittedRoles, indexRawRelators.toLowerCase().equals("true"))); - relators.addAll(getValidRelatorsFromSubfields(subfield4, permittedRoles, indexRawRelators.toLowerCase().equals("true"))); - if (Arrays.asList(unknownRelatorAllowed).contains(tag)) { - Set unknown = getUnknownRelatorsFromSubfields(subfieldE); - if (unknown.size() == 0) { - unknown = getUnknownRelatorsFromSubfields(subfield4); - } - relators.addAll(unknown); - } - } - return relators; -} - -/** - * Parse a SolrMarc fieldspec into a map of tag name to set of subfield strings - * (note that we need to map to a set rather than a single string, because the - * same tag may repeat with different subfields to extract different sections - * of the same field into distinct values). - * - * @param tagList The field specification to parse - * @return HashMap - */ -public HashMap getParsedTagList(String tagList) -{ - String[] tags = tagList.split(":");//convert string input to array - HashMap tagMap = new HashMap(); - //cut tags array up into key/value pairs in hash map - Set currentSet; - for(int i = 0; i < tags.length; i++){ - String tag = tags[i].substring(0, 3); - if (!tagMap.containsKey(tag)) { - currentSet = new LinkedHashSet(); - tagMap.put(tag, currentSet); - } else { - currentSet = tagMap.get(tag); - } - currentSet.add(tags[i].substring(3)); - } - return tagMap; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @param firstOnly Return first result only? 
- * @return List result - */ -public List getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly -) { - List result = new LinkedList(); - String[] noRelatorAllowed = acceptWithoutRelator.split(":"); - String[] unknownRelatorAllowed = acceptUnknownRelators.split(":"); - HashMap parsedTagList = getParsedTagList(tagList); - List fields = indexer.getFieldSetMatchingTagList(record, tagList); - Iterator fieldsIter = fields.iterator(); - if (fields != null){ - DataField authorField; - while (fieldsIter.hasNext()){ - authorField = (DataField) fieldsIter.next(); - // add all author types to the result set; if we have multiple relators, repeat the authors - for (String iterator: getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators)) { - for (String subfields : parsedTagList.get(authorField.getTag())) { - String current = indexer.getDataFromVariableField(authorField, "["+subfields+"]", " ", false); - // TODO: we may eventually be able to use this line instead, - // but right now it's not handling separation between the - // subfields correctly, so it's commented out until that is - // fixed. - //String current = authorField.getSubfieldsAsString(subfields); - if (null != current) { - result.add(current); - if (firstOnly) { - return result; - } - } - } - } - } - } - return result; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ -public List getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig -) { - // default firstOnly to false! - return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false", false - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - */ -public List getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators -) { - // default firstOnly to false! - return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false", false - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. 
Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - */ -public List getAuthorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators -) { - // default firstOnly to false! - return getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, false - ); -} - -/** - * If the provided relator is included in the synonym list, convert it back to - * a code (for better standardization/translation). - * - * @param relator Relator code to check - * @return Code version, if found, or raw string if no match found. - */ -public String mapRelatorStringToCode(String relator) -{ - String normalizedRelator = normalizeRelatorString(relator); - return relatorSynonymLookup.containsKey(normalizedRelator) - ? relatorSynonymLookup.get(normalizedRelator) : relator; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return String - */ -public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators -) { - List result = getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, true - ); - for (String s : result) { - return s; - } - return null; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. 
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return String - */ -public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig -) { - return getFirstAuthorFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false" - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return String - */ -public String getFirstAuthorFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators -) { - return getFirstAuthorFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false" - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @param firstOnly Return first result only? 
- * @return List result - */ -public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators, Boolean firstOnly -) { - List result = new LinkedList(); - String[] noRelatorAllowed = acceptWithoutRelator.split(":"); - String[] unknownRelatorAllowed = acceptUnknownRelators.split(":"); - HashMap parsedTagList = getParsedTagList(tagList); - List fields = indexer.getFieldSetMatchingTagList(record, tagList); - Iterator fieldsIter = fields.iterator(); - if (fields != null){ - DataField authorField; - while (fieldsIter.hasNext()){ - authorField = (DataField) fieldsIter.next(); - //add all author types to the result set - result.addAll(getValidRelators(authorField, noRelatorAllowed, relatorConfig, unknownRelatorAllowed, indexRawRelators)); - } - } - return result; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return List result - */ -public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators -) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators, false - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return List result - */ -public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators -) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false", false - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for saving relators of authors separated by different - * types. 
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ -public List getRelatorsFilteredByRelator(Record record, String tagList, - String acceptWithoutRelator, String relatorConfig -) { - // default firstOnly to false! - return getRelatorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false", false - ); -} - -/** - * This method fetches relator definitions from ini file and casts them to an - * array. If a colon-delimited string is passed in, this will be directly parsed - * instead of resorting to .ini loading. - * - * @param setting Setting to load from .ini or colon-delimited list. - * @return String[] - */ -public String[] loadRelatorConfig(String setting){ - StringBuilder relators = new StringBuilder(); - - // check for pipe-delimited string - String[] relatorSettings = setting.split("\\|"); - for (String relatorSetting: relatorSettings) { - // check for colon-delimited string - String[] relatorArray = relatorSetting.split(":"); - if (relatorArray.length > 1) { - for (int i = 0; i < relatorArray.length; i++) { - relators.append(relatorArray[i]).append(","); - } - } else { - relators.append(indexer.getConfigSetting( - "author-classification.ini", "AuthorRoles", relatorSetting - )).append(","); - } - } - - return relators.toString().split(","); -} - -/** - * Normalizes a relator string and returns a list containing the normalized - * relator plus any configured synonyms. - * - * @param relator Relator term to normalize - * @return List of strings - */ -public List normalizeRelatorAndAddSynonyms(String relator) -{ - List newList = new ArrayList(); - String normalized = normalizeRelatorString(relator); - newList.add(normalized); - String synonyms = indexer.getConfigSetting( - "author-classification.ini", "RelatorSynonyms", relator - ); - if (null != synonyms && synonyms.length() > 0) { - for (String synonym: synonyms.split("\\|")) { - String normalizedSynonym = normalizeRelatorString(synonym); - relatorSynonymLookup.put(normalizedSynonym, relator); - newList.add(normalizedSynonym); - } - } - return newList; -} - -/** - * Normalizes the strings in a list. - * - * @param stringList List of strings to be normalized - * @return Normalized List of strings - */ -public List normalizeRelatorStringList(List stringList) -{ - List newList = new ArrayList(); - for (String relator: stringList) { - newList.addAll(normalizeRelatorAndAddSynonyms(relator)); - } - return newList; -} - -/** - * Normalizes a string - * - * @param string String to be normalized - * @return string - */ -public String normalizeRelatorString(String string) -{ - return string - .trim() - .toLowerCase() - .replaceAll("\\p{Punct}+", ""); //POSIX character class Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. 
- * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @param indexRawRelators Set to "true" to index relators raw, as found - * in the MARC or "false" to index mapped versions. - * @return List result - */ -public List getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators, String indexRawRelators -) { - List authors = getAuthorsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, indexRawRelators - ); - List result = new LinkedList(); - for (String author : authors) { - result.add(processInitials(author)); - } - return result; -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @return List result - */ -public List getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig -) { - return getAuthorInitialsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptWithoutRelator, "false" - ); -} - -/** - * Filter values retrieved using tagList to include only those whose relator - * values are acceptable. Used for separating different types of authors. - * - * @param record The record (fed in automatically) - * @param tagList The field specification to read - * @param acceptWithoutRelator Colon-delimited list of tags whose values should - * be accepted even if no relator subfield is defined - * @param relatorConfig The setting in author-classification.ini which - * defines which relator terms are acceptable (or a colon-delimited list) - * @param acceptUnknownRelators Colon-delimited list of tags whose relators - * should be indexed even if they are not listed in author-classification.ini. - * @return List result - */ -public List getAuthorInitialsFilteredByRelator(Record record, - String tagList, String acceptWithoutRelator, String relatorConfig, - String acceptUnknownRelators -) { - return getAuthorInitialsFilteredByRelator( - record, tagList, acceptWithoutRelator, relatorConfig, - acceptUnknownRelators, "false" - ); -} - -/** - * Takes a name and cuts it into initials - * @param authorName e.g. Yeats, William Butler - * @return initials e.g. 
w b y wb
- */
-public String processInitials(String authorName) {
-    boolean isPersonalName = false;
-    // we guess that if there is a comma before the end, this is a personal name
-    if ((authorName.indexOf(',') > 0)
-        && (authorName.indexOf(',') < authorName.length()-1)) {
-        isPersonalName = true;
-    }
-    // get rid of non-alphabet chars but keep hyphens and accents
-    authorName = authorName.replaceAll("[^\\p{L} -]", "").toLowerCase();
-    String[] names = authorName.split(" "); //split into tokens on spaces
-    // if this is a personal name we'll reorganise to put lastname at the end
-    String result = "";
-    if (isPersonalName) {
-        String lastName = names[0];
-        for (int i = 0; i < names.length-1; i++) {
-            names[i] = names[i+1];
-        }
-        names[names.length-1] = lastName;
-    }
-    // put all the initials together in a space separated string
-    for (String name : names) {
-        if (name.length() > 0) {
-            String initial = name.substring(0,1);
-            // if there is a hyphenated name, use both initials
-            int pos = name.indexOf('-');
-            if (pos > 0 && pos < name.length() - 1) {
-                String extra = name.substring(pos+1, pos+2);
-                initial = initial + " " + extra;
-            }
-            result += " " + initial;
-        }
-    }
-    // grab all initials and stick them together
-    String smushAll = result.replaceAll(" ", "");
-    // if it's a long personal name, also get all but the last initial
-    // e.g. wb for william butler yeats
-    if (names.length > 2 && isPersonalName) {
-        String smushPers = result.substring(0,result.length()-1).replaceAll(" ","");
-        result = result + " " + smushPers;
-    }
-    // now we have initials separate and together
-    if (!result.trim().equals(smushAll)) {
-        result += " " + smushAll;
-    }
-    result = result.trim();
-    return result;
-}
diff --git a/import/index_scripts/callnumber.bsh b/import/index_scripts/callnumber.bsh
deleted file mode 100644
index fa87d608ffe9dedc1e86b48a938254832d08ce79..0000000000000000000000000000000000000000
--- a/import/index_scripts/callnumber.bsh
+++ /dev/null
@@ -1,142 +0,0 @@
-/**
- * Custom call number script.
- *
- * This can be used to override built-in SolrMarc custom functions. If you change
- * this script, you will need to activate it in import/marc_local.properties before
- * it will be applied during indexing.
- */
-import org.marc4j.marc.Record;
-import org.solrmarc.callnum.LCCallNumber;
-import org.solrmarc.index.SolrIndexer;
-
-// define the base level indexer so that its methods can be called from the script.
-// note that the SolrIndexer code will set this value before the script methods are called.
-org.solrmarc.index.SolrIndexer indexer = null;
-
- /**
- * Extract the full call number from a record, stripped of spaces
- * @param record MARC record
- * @return Call number label
- * @deprecated Obsolete as of VuFind 2.4.
- * This method exists only to support the VuFind call number search, version <= 2.3.
- * As of VuFind 2.4, the munging for call number search is handled entirely in Solr.
- */
- public String getFullCallNumber(Record record) {
-
-     return(getFullCallNumber(record, "099ab:090ab:050ab"));
- }
-
- /**
- * Extract the full call number from a record, stripped of spaces
- * @param record MARC record
- * @param fieldSpec taglist for call number fields
- * @return Call number label
- * @deprecated Obsolete as of VuFind 2.4.
- * This method exists only to support the VuFind call number search, version <= 2.3.
- * As of VuFind 2.4, the munging for call number search is handled entirely in Solr.
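- * Example: a stored value of "PS 3537 .A832" is returned as "PS3537.A832".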
- */
- public String getFullCallNumber(Record record, String fieldSpec) {
-
-     String val = SolrIndexer.getFirstFieldVal(record, fieldSpec);
-
-     if (val != null) {
-         return val.toUpperCase().replaceAll(" ", "");
-     } else {
-         return val;
-     }
- }
-
- /**
- * Extract the call number label from a record
- * @param record MARC record
- * @return Call number label
- */
- public String getCallNumberLabel(Record record) {
-
-     return getCallNumberLabel(record, "090a:050a");
- }
-
- /**
- * Extract the call number label from a record
- * @param record MARC record
- * @param fieldSpec taglist for call number fields
- * @return Call number label
- */
- public String getCallNumberLabel(Record record, String fieldSpec) {
-
-     String val = SolrIndexer.getFirstFieldVal(record, fieldSpec);
-
-     if (val != null) {
-         int dotPos = val.indexOf(".");
-         if (dotPos > 0) {
-             val = val.substring(0, dotPos);
-         }
-         return val.toUpperCase();
-     } else {
-         return val;
-     }
- }
-
- /**
- * Extract the subject component of the call number
- *
- * Can return null
- *
- * @param record MARC record
- * @return Call number subject letters
- */
- public String getCallNumberSubject(Record record) {
-
-     return(getCallNumberSubject(record, "090a:050a"));
- }
-
- /**
- * Extract the subject component of the call number
- *
- * Can return null
- *
- * @param record current MARC record
- * @param fieldSpec taglist for call number fields
- * @return Call number subject letters
- */
- public String getCallNumberSubject(Record record, String fieldSpec) {
-
-     String val = SolrIndexer.getFirstFieldVal(record, fieldSpec);
-
-     if (val != null) {
-         String [] callNumberSubject = val.toUpperCase().split("[^A-Z]+");
-         if (callNumberSubject.length > 0)
-         {
-             return callNumberSubject[0];
-         }
-     }
-     return(null);
- }
-
- /**
- * Normalize a single LC call number
- * @param record current MARC record
- * @return String Normalized LC call number
- */
- public String getFullCallNumberNormalized(Record record) {
-
-     return(getFullCallNumberNormalized(record, "099ab:090ab:050ab"));
- }
-
- /**
- * Normalize a single LC call number
- * @param record current MARC record
- * @param fieldSpec which MARC fields / subfields need to be analyzed
- * @return String Normalized LC call number
- */
- public String getFullCallNumberNormalized(Record record, String fieldSpec) {
-
-     // TODO: is the null fieldSpec still an issue?
-     if (fieldSpec != null) {
-         String cn = SolrIndexer.getFirstFieldVal(record, fieldSpec);
-         // Guard against records with no call number at all:
-         if (cn != null) {
-             return (new LCCallNumber(cn)).getShelfKey();
-         }
-     }
-     // If we got this far, we couldn't find a valid value:
-     return null;
- }
diff --git a/import/index_scripts/dewey.bsh b/import/index_scripts/dewey.bsh
deleted file mode 100644
index 50edb41552ab3c45c591f290400b40cc7c909247..0000000000000000000000000000000000000000
--- a/import/index_scripts/dewey.bsh
+++ /dev/null
@@ -1,201 +0,0 @@
-/**
- * Custom Dewey call number script.
- *
- * This can be used to override built-in SolrMarc custom functions. If you change
- * this script, you will need to activate it in import/marc_local.properties before
- * it will be applied during indexing.
- */
-import org.marc4j.marc.Record;
-import org.solrmarc.callnum.DeweyCallNumber;
-import org.solrmarc.index.SolrIndexer;
-import org.solrmarc.tools.CallNumUtils;
-
-// The beanshell script class will initialize this variable to point to the singleton SolrIndexer class.
-SolrIndexer indexer; - -/** - * Extract a numeric portion of the Dewey decimal call number - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param precisionStr a decimal number (represented in string format) showing the - * desired precision of the returned number; i.e. 100 to round to nearest hundred, - * 10 to round to nearest ten, 0.1 to round to nearest tenth, etc. - * @return Set containing requested numeric portions of Dewey decimal call numbers - */ -public Set getDeweyNumber(Record record, String fieldSpec, String precisionStr) { - // Initialize our return value: - Set result = new LinkedHashSet(); - - // Precision comes in as a string, but we need to convert it to a float: - float precision = Float.parseFloat(precisionStr); - - // Loop through the specified MARC fields: - Set input = indexer.getFieldList(record, fieldSpec); - for (String current: input) { - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - // Convert the numeric portion of the call number into a float: - float currentVal = Float.parseFloat(callNum.getClassification()); - - // Round the call number value to the specified precision: - Float finalVal = new Float(Math.floor(currentVal / precision) * precision); - - // Convert the rounded value back to a string (with leading zeros) and save it: - // TODO: Provide different conversion to remove CallNumUtils dependency - result.add(CallNumUtils.normalizeFloat(finalVal.toString(), 3, -1)); - } - } - - // If we found no call number matches, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; -} - -/** - * Normalize Dewey numbers for searching purposes (uppercase/stripped spaces) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return Set containing normalized Dewey numbers extracted from specified fields. - */ -public Set getDeweySearchable(Record record, String fieldSpec) { - // Initialize our return value: - Set result = new LinkedHashSet(); - - // Loop through the specified MARC fields: - Set input = indexer.getFieldList(record, fieldSpec); - Iterator iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // Add valid strings to the set, normalizing them to be all uppercase - // and free from whitespace. - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - result.add(callNum.toString().toUpperCase().replaceAll(" ", "")); - } - } - - // If we found no call numbers, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; -} - -/** - * Normalize Dewey numbers for sorting purposes (use only the first valid number!) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return String containing the first valid Dewey number encountered, normalized - * for sorting purposes. 
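- * For instance, "813.54" produces a shelf key that sorts before the key for
- * "813.6", so decimal order is preserved in plain string comparisons.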
- */ -public String getDeweySortable(Record record, String fieldSpec) { - // Loop through the specified MARC fields: - Set input = indexer.getFieldList(record, fieldSpec); - Iterator iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // If this is a valid Dewey number, return the sortable shelf key: - DeweyCallNumber callNum = new DeweyCallNumber(current); - if (callNum.isValid()) { - return callNum.getShelfKey(); - } - } - - // If we made it this far, we didn't find a valid sortable Dewey number: - return null; -} - -/** - * Get sort key for first Dewey call number, identified by call type. - * - * <p>{@code fieldSpec} is of form {@literal 098abc:099ab}, does not accept subfield ranges. - * - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @param callTypeSf subfield containing call number type, single character only - * @param callType literal call number code - * @return sort key for first identified Dewey call number - */ -public static String getDeweySortableByType( - Record record, String fieldSpec, String callTypeSf, String callType) { - String sortKey = null; - for (String tag : fieldSpec.split(":")) { - // Check to ensure tag length is at least 3 characters - if (tag.length() < 3) { - //TODO: Should this go to a log? Better message for a bad tag in a field spec? - System.err.println("Invalid tag specified: " + tag); - continue; - } - String dfTag = tag.substring(0, 3); - String sfSpec = null; - if (tag.length() > 3) { - sfSpec = tag.substring(3); - } - - // do all fields for this tag - for (VariableField vf : record.getVariableFields(dfTag)) { - // Assume tag represents a DataField - DataField df = (DataField) vf; - boolean callTypeMatch = false; - - // Assume call type subfield could repeat - for (Subfield typeSf : df.getSubfields(callTypeSf)) { - if (callTypeSf.indexOf(typeSf.getCode()) != -1 && typeSf.getData().equals(callType)) { - callTypeMatch = true; - } - } - // take the first call number coded as Dewey - if (callTypeMatch) { - sortKey = new DeweyCallNumber(df.getSubfieldsAsString(sfSpec)).getShelfKey(); - break; - } - } // end loop over variable fields - } // end loop over fieldSpec - return sortKey; -} - - -/** - * Normalize Dewey numbers for AlphaBrowse sorting purposes (use all numbers!) - * - * Can return null - * - * @param record current MARC record - * @param fieldSpec which MARC fields / subfields need to be analyzed - * @return List containing normalized Dewey numbers extracted from specified fields. 
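- * Unlike getDeweySortable() above, this variant does not skip invalid call
- * numbers: every extracted value contributes a shelf key, as AlphaBrowse
- * expects.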
- */ -public List getDeweySortables(Record record, String fieldSpec) { - // Initialize our return value: - List result = new LinkedList(); - - // Loop through the specified MARC fields: - Set input = indexer.getFieldList(record, fieldSpec); - Iterator iter = input.iterator(); - while (iter.hasNext()) { - // Get the current string to work on: - String current = iter.next(); - - // gather all sort keys, even if number is not valid - DeweyCallNumber callNum = new DeweyCallNumber(current); - result.add(callNum.getShelfKey()); - } - - // If we found no call numbers, return null; otherwise, return our results: - if (result.isEmpty()) - return null; - return result; -} diff --git a/import/index_scripts/format.bsh b/import/index_scripts/format.bsh deleted file mode 100644 index 863e167722c1f29e73bb1c67495553bc2b1f4e78..0000000000000000000000000000000000000000 --- a/import/index_scripts/format.bsh +++ /dev/null @@ -1,302 +0,0 @@ -/** - * Custom format determination script. - * - * This can be used to override built-in SolrMarc custom functions. If you change - * this script, you will need to activate it in import/marc_local.properties before - * it will be applied during indexing. - */ -import org.marc4j.marc.Record; -import org.marc4j.marc.ControlField; -import org.marc4j.marc.DataField; - -/** - * Determine Record Format(s) - * - * @param Record record - * @return Set format of record - */ -public Set getFormat(Record record){ - Set result = new LinkedHashSet(); - String leader = record.getLeader().toString(); - char leaderBit; - ControlField fixedField = (ControlField) record.getVariableField("008"); - DataField title = (DataField) record.getVariableField("245"); - String formatString; - char formatCode = ' '; - char formatCode2 = ' '; - char formatCode4 = ' '; - - // check if there's an h in the 245 - if (title != null) { - if (title.getSubfield('h') != null){ - if (title.getSubfield('h').getData().toLowerCase().contains("[electronic resource]")) { - result.add("Electronic"); - return result; - } - } - } - - // check the 007 - this is a repeating field - List fields = record.getVariableFields("007"); - Iterator fieldsIter = fields.iterator(); - if (fields != null) { - ControlField formatField; - while(fieldsIter.hasNext()) { - formatField = (ControlField) fieldsIter.next(); - formatString = formatField.getData().toUpperCase(); - formatCode = formatString.length() > 0 ? formatString.charAt(0) : ' '; - formatCode2 = formatString.length() > 1 ? formatString.charAt(1) : ' '; - formatCode4 = formatString.length() > 4 ? 
formatString.charAt(4) : ' '; - switch (formatCode) { - case 'A': - switch(formatCode2) { - case 'D': - result.add("Atlas"); - break; - default: - result.add("Map"); - break; - } - break; - case 'C': - switch(formatCode2) { - case 'A': - result.add("TapeCartridge"); - break; - case 'B': - result.add("ChipCartridge"); - break; - case 'C': - result.add("DiscCartridge"); - break; - case 'F': - result.add("TapeCassette"); - break; - case 'H': - result.add("TapeReel"); - break; - case 'J': - result.add("FloppyDisk"); - break; - case 'M': - case 'O': - result.add("CDROM"); - break; - case 'R': - // Do not return - this will cause anything with an - // 856 field to be labeled as "Electronic" - break; - default: - result.add("Software"); - break; - } - break; - case 'D': - result.add("Globe"); - break; - case 'F': - result.add("Braille"); - break; - case 'G': - switch(formatCode2) { - case 'C': - case 'D': - result.add("Filmstrip"); - break; - case 'T': - result.add("Transparency"); - break; - default: - result.add("Slide"); - break; - } - break; - case 'H': - result.add("Microfilm"); - break; - case 'K': - switch(formatCode2) { - case 'C': - result.add("Collage"); - break; - case 'D': - result.add("Drawing"); - break; - case 'E': - result.add("Painting"); - break; - case 'F': - result.add("Print"); - break; - case 'G': - result.add("Photonegative"); - break; - case 'J': - result.add("Print"); - break; - case 'L': - result.add("Drawing"); - break; - case 'O': - result.add("FlashCard"); - break; - case 'N': - result.add("Chart"); - break; - default: - result.add("Photo"); - break; - } - break; - case 'M': - switch(formatCode2) { - case 'F': - result.add("VideoCassette"); - break; - case 'R': - result.add("Filmstrip"); - break; - default: - result.add("MotionPicture"); - break; - } - break; - case 'O': - result.add("Kit"); - break; - case 'Q': - result.add("MusicalScore"); - break; - case 'R': - result.add("SensorImage"); - break; - case 'S': - switch(formatCode2) { - case 'D': - result.add("SoundDisc"); - break; - case 'S': - result.add("SoundCassette"); - break; - default: - result.add("SoundRecording"); - break; - } - break; - case 'V': - switch(formatCode2) { - case 'C': - result.add("VideoCartridge"); - break; - case 'D': - switch(formatCode4) { - case 'S': - result.add("BRDisc"); - break; - case 'V': - default: - result.add("VideoDisc"); - break; - } - break; - case 'F': - result.add("VideoCassette"); - break; - case 'R': - result.add("VideoReel"); - break; - default: - result.add("Video"); - break; - } - break; - } - } - if (!result.isEmpty()) { - return result; - } - } - - // check the Leader at position 6 - leaderBit = leader.charAt(6); - switch (Character.toUpperCase(leaderBit)) { - case 'C': - case 'D': - result.add("MusicalScore"); - break; - case 'E': - case 'F': - result.add("Map"); - break; - case 'G': - result.add("Slide"); - break; - case 'I': - result.add("SoundRecording"); - break; - case 'J': - result.add("MusicRecording"); - break; - case 'K': - result.add("Photo"); - break; - case 'M': - result.add("Electronic"); - break; - case 'O': - case 'P': - result.add("Kit"); - break; - case 'R': - result.add("PhysicalObject"); - break; - case 'T': - result.add("Manuscript"); - break; - } - if (!result.isEmpty()) { - return result; - } - - // check the Leader at position 7 - leaderBit = leader.charAt(7); - switch (Character.toUpperCase(leaderBit)) { - // Monograph - case 'M': - if (formatCode == 'C') { - result.add("eBook"); - } else { - result.add("Book"); - } - break; - // Component 
parts - case 'A': - result.add("BookComponentPart"); - break; - case 'B': - result.add("SerialComponentPart"); - break; - // Serial - case 'S': - // Look in 008 to determine what type of Continuing Resource - formatCode = fixedField.getData().toUpperCase().charAt(21); - switch (formatCode) { - case 'N': - result.add("Newspaper"); - break; - case 'P': - result.add("Journal"); - break; - default: - result.add("Serial"); - break; - } - } - - // Nothing worked! - if (result.isEmpty()) { - result.add("Unknown"); - } - - return result; -} diff --git a/import/index_scripts/getFirstNormalizedLCCN.bsh b/import/index_scripts/getFirstNormalizedLCCN.bsh deleted file mode 100644 index e4b00be6ae5cb5f67d1c116f254e514105511ecc..0000000000000000000000000000000000000000 --- a/import/index_scripts/getFirstNormalizedLCCN.bsh +++ /dev/null @@ -1,75 +0,0 @@ -import org.marc4j.marc.Record; - -// Give ourselves the ability to import other BeanShell scripts -String vufindHome = System.getenv("VUFIND_HOME"); -String vufindLocal = System.getenv("VUFIND_LOCAL_DIR"); -addClassPath(vufindHome + "/import"); -if (vufindLocal != null) { - addClassPath(vufindLocal + "/import"); -} -importCommands("index_scripts"); - -// define the base level indexer so that its methods can be called from the script. -// note that the SolrIndexer code will set this value before the script methods are called. -org.solrmarc.index.SolrIndexer indexer = null; - -/** - * Extract the first valid LCCN from a record and return it in a normalized format - * with an optional prefix added (helpful for guaranteeing unique IDs) - * @param indexer - * @param record - * @param fieldSpec - * @param prefix - * @return Normalized LCCN - */ -public String getFirstNormalizedLCCN(org.solrmarc.index.SolrIndexer indexer, - Record record, String fieldSpec, String prefix) { - // Loop through relevant fields in search of first valid LCCN: - Set lccns = indexer.getFieldList(record, fieldSpec); - Iterator lccnIter = lccns.iterator(); - if (lccns != null) { - String current; - while(lccnIter.hasNext()) { - current = getNormalizedLCCN(lccnIter.next()); - if (current != null && current != false && current.length() > 0) { - return prefix + current; - } - } - } - - // If we got this far, we couldn't find a valid value: - return null; -} - -/** - * Extract the first valid LCCN from a record and return it in a normalized format - * with an optional prefix added (helpful for guaranteeing unique IDs) - * @param record - * @param fieldSpec - * @param prefix - * @return Normalized LCCN - */ -public String getFirstNormalizedLCCN(Record record, String fieldSpec, String prefix) { - return getFirstNormalizedLCCN(indexer, record, fieldSpec, prefix); -} - -/** - * Extract the first valid LCCN from a record and return it in a normalized format - * @param record - * @param fieldSpec - * @return Normalized LCCN - */ -public String getFirstNormalizedLCCN(Record record, String fieldSpec) { - // Send in a default prefix if none was provided by the user: - return getFirstNormalizedLCCN(indexer, record, fieldSpec, ""); -} - -/** - * Extract the first valid LCCN from a record and return it in a normalized format - * @param record - * @return Normalized LCCN - */ -public String getFirstNormalizedLCCN(Record record) { - // Send in a default fieldSpec/prefix if none were provided by the user: - return getFirstNormalizedLCCN(indexer, record, "010a", ""); -} diff --git a/import/index_scripts/getFulltext.bsh b/import/index_scripts/getFulltext.bsh deleted file mode 100644 index 
6385c8640b4a21ef68add7ee431ae1f51bcedb91..0000000000000000000000000000000000000000 --- a/import/index_scripts/getFulltext.bsh +++ /dev/null @@ -1,260 +0,0 @@ -/** - * Custom full text retrieval script. - * - * This can be used to override built-in SolrMarc custom functions. If you change - * this script, you will need to activate it in import/marc_local.properties before - * it will be applied during indexing. - */ -import org.marc4j.marc.Record; -import org.marc4j.marc.DataField; -import java.util.regex.Pattern; -import java.io.*; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - -// define the base level indexer so that its methods can be called from the script. -// note that the SolrIndexer code will set this value before the script methods are called. -org.solrmarc.index.SolrIndexer indexer = null; - -/** - * Load configurations for the full text parser. Return an array containing the - * parser type in the first element and the parser configuration in the second - * element. - * - * @return String[] - */ -public String[] getFulltextParserSettings() -{ - String parserType = indexer.getConfigSetting( - "fulltext.ini", "General", "parser" - ); - if (null != parserType) { - parserType = parserType.toLowerCase(); - } - - // Is Aperture active? - String aperturePath = indexer.getConfigSetting( - "fulltext.ini", "Aperture", "webcrawler" - ); - if ((null == parserType && null != aperturePath) - || (null != parserType && parserType.equals("aperture")) - ) { - String[] array = { "aperture", aperturePath }; - return array; - } - - // Is Tika active? - String tikaPath = indexer.getConfigSetting( - "fulltext.ini", "Tika", "path" - ); - if ((null == parserType && null != tikaPath) - || (null != parserType && parserType.equals("tika")) - ) { - String[] array = { "tika", tikaPath }; - return array; - } - - // No recognized parser found: - String[] array = { "none", null }; - return array; -} - -/** - * Extract full-text from the documents referenced in the tags - * - * @param Record record - * @param String field spec to search for URLs - * @param String only harvest files matching this extension (null for all) - * @return String The full-text - */ -public String getFulltext(Record record, String fieldSpec, String extension) { - String result = ""; - - // Get the web crawler settings (and return no text if it is unavailable) - String[] parserSettings = getFulltextParserSettings(); - if (parserSettings[0].equals("none")) { - return null; - } - - // Loop through the specified MARC fields: - Set fields = indexer.getFieldList(record, fieldSpec); - Iterator fieldsIter = fields.iterator(); - if (fields != null) { - while(fieldsIter.hasNext()) { - // Get the current string to work on (and sanitize spaces): - String current = fieldsIter.next().replaceAll(" ", "%20"); - // Filter by file extension - if (extension == null || current.endsWith(extension)) { - // Load the parser output for each tag into a string - result = result + harvestWithParser(current, parserSettings); - } - } - } - // return string to SolrMarc - return result; -} - -/** - * Extract full-text from the documents referenced in the tags - * - * @param Record record - * @param String field spec to search for URLs - * @return String The full-text - */ -public String getFulltext(Record record, String fieldSpec) { - return getFulltext(record, fieldSpec, null); -} - -/** - * Extract 
full-text from the documents referenced in the tags - * - * @param Record record - * @return String The full-text - */ -public String getFulltext(Record record) { - return getFulltext(record, "856u", null); -} - -/** - * Clean up XML data generated by Aperture - * - * @param File The file to clean - * @return File A fixed version of the file - */ -public File sanitizeApertureOutput(File f) -{ - //clean up the aperture xml output - File tempFile = File.createTempFile("buffer", ".tmp"); - FileOutputStream fw = new FileOutputStream(tempFile); - Writer writer = new OutputStreamWriter(fw, "UTF8"); - - //delete this control character from the File and save - Reader fr = new FileReader(f); - BufferedReader br = new BufferedReader(fr); - while (br.ready()) { - writer.write(sanitizeFullText(br.readLine())); - } - writer.close(); - br.close(); - fr.close(); - - return tempFile; -} - -/** - * Clean up bad characters in the full text. - * - * @param String Text to clean - * @return String Cleaned text - */ -public String sanitizeFullText(text) -{ - String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; - return text.replaceAll(badChars, " "); -} - -/** - * Harvest the contents of a document file (PDF, Word, etc.) using Aperture. - * This method will only work if Aperture is properly configured in the - * fulltext.ini file. Without proper configuration, this will simply return an - * empty string. - * - * @param String The url extracted from the MARC tag. - * @param String The path to Aperture - * @return String The full-text - */ -public String harvestWithAperture(url, aperturePath) { - String plainText = ""; - // Create temp file. - File f = File.createTempFile("apt", ".txt"); - - // Delete temp file when program exits. - f.deleteOnExit(); - - // Construct the command to call Aperture - String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url; - - // Call Aperture - //System.out.println("Loading fulltext from " + url + ". Please wait ..."); - Process p = Runtime.getRuntime().exec(cmd); - BufferedReader stdInput = new BufferedReader(new - InputStreamReader(p.getInputStream())); - while ((s = stdInput.readLine()) != null) { - //System.out.println(s); - } - // Wait for Aperture to finish - p.waitFor(); - - // Parse Aperture XML output - try { - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = dbf.newDocumentBuilder(); - File tempFile = sanitizeApertureOutput(f); - Document xmlDoc = db.parse(tempFile); - NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); - if(nl != null && nl.getLength() > 0) { - Node node = nl.item(0); - if (node.getNodeType() == Node.ELEMENT_NODE) { - plainText = plainText + node.getTextContent(); - } - } - - // we'll hold onto the temp file if it failed to parse for debugging; - // only set it up to be deleted if we've made it this far successfully. - tempFile.deleteOnExit(); - } catch (Exception e) { - indexer.getLogger().error("Error encountered parsing XML Document: " + e); - } - - return plainText; -} - -/** - * Harvest the contents of a document file (PDF, Word, etc.) using Tika. - * This method will only work if Tika is properly configured in the fulltext.ini - * file. Without proper configuration, this will simply return an empty string. - * - * @param String The url extracted from the MARC tag. 
- * @param String The path to Tika - * @return String The full-text - */ -public String harvestWithTika(url, scraperPath) { - String plainText = ""; - - // Construct the command - String cmd = "java -jar " + scraperPath + " -t -eUTF8 " + url; - - // Call our scraper - //System.out.println("Loading fulltext from " + url + ". Please wait ..."); - Process p = Runtime.getRuntime().exec(cmd); - BufferedReader stdInput = new BufferedReader(new - InputStreamReader(p.getInputStream(), "UTF8")); - - // We'll build the string from the command output - StringBuilder stringBuilder= new StringBuilder(); - while ((s = stdInput.readLine()) != null) { - stringBuilder.append(s); - } - - return sanitizeFullText(stringBuilder.toString()); -} - -/** - * Harvest the contents of a document file (PDF, Word, etc.) using the active parser. - * - * @param String The url extracted from the MARC tag. - * @param String[] Configuration settings from getFulltextParserSettings. - * @return String The full-text - */ -public String harvestWithParser(url, settings) { - if (settings[0].equals("aperture")) { - return harvestWithAperture(url, settings[1]); - } else if (settings[0].equals("tika")) { - return harvestWithTika(url, settings[1]); - } - return null; -} diff --git a/import/index_scripts/getNormalizedLCCN.bsh b/import/index_scripts/getNormalizedLCCN.bsh deleted file mode 100644 index 8f5d9182e7780d79ec983607604f176c639b3234..0000000000000000000000000000000000000000 --- a/import/index_scripts/getNormalizedLCCN.bsh +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Normalize a single LCCN using the procedure specified at: - * http://www.loc.gov/marc/lccn-namespace.html#normalization - * @param lccn - * @return Normalized LCCN - */ -public String getNormalizedLCCN(String lccn) { - // Remove whitespace: - lccn = lccn.replaceAll(" ", ""); - - // Chop off anything following a forward slash: - String[] parts = lccn.split("/", 2); - lccn = parts[0]; - - // Normalize any characters following a hyphen to at least six digits: - parts = lccn.split("-", 2); - if (parts.length > 1) { - String secondPart = parts[1]; - while (secondPart.length() < 6) { - secondPart = "0" + secondPart; - } - lccn = parts[0] + secondPart; - } - - // Send back normalized LCCN: - return lccn; -} diff --git a/import/index_scripts/getNormalizedLCCNs.bsh b/import/index_scripts/getNormalizedLCCNs.bsh deleted file mode 100644 index b5ca263e5550d1379737f2f59e25511e07d5c945..0000000000000000000000000000000000000000 --- a/import/index_scripts/getNormalizedLCCNs.bsh +++ /dev/null @@ -1,51 +0,0 @@ -import org.marc4j.marc.Record; - -// Give ourselves the ability to import other BeanShell scripts -String vufindHome = System.getenv("VUFIND_HOME"); -String vufindLocal = System.getenv("VUFIND_LOCAL_DIR"); -addClassPath(vufindHome + "/import"); -if (vufindLocal != null) { - addClassPath(vufindLocal + "/import"); -} -importCommands("index_scripts"); - -// define the base level indexer so that its methods can be called from the script. -// note that the SolrIndexer code will set this value before the script methods are called. 
-org.solrmarc.index.SolrIndexer indexer = null;
-
-/**
- * Extract LCCNs from a record and return them in a normalized format
- * @param record
- * @param fieldSpec
- * @return Set of normalized LCCNs
- */
-public Set getNormalizedLCCNs(Record record, String fieldSpec) {
-    // Initialize return value:
-    Set result = new LinkedHashSet();
-
-    // Loop through relevant fields and normalize everything:
-    Set lccns = indexer.getFieldList(record, fieldSpec);
-    if (lccns != null) {
-        // Only obtain the iterator once we know the list is non-null:
-        Iterator lccnIter = lccns.iterator();
-        String current;
-        while (lccnIter.hasNext()) {
-            current = getNormalizedLCCN(lccnIter.next());
-            if (current != null && current.length() > 0) {
-                result.add(current);
-            }
-        }
-    }
-
-    // Send back results:
-    return result;
-}
-
-/**
- * Extract LCCNs from a record and return them in a normalized format
- * @param record
- * @return Set of normalized LCCNs
- */
-public Set getNormalizedLCCNs(Record record) {
-    // Send in a default fieldSpec if none was provided by the user:
-    return getNormalizedLCCNs(record, "010a");
-}
diff --git a/import/index_scripts/getdate.bsh b/import/index_scripts/getdate.bsh
deleted file mode 100644
index 0a8186b4d5bc8414c0bfbe6f478d772cefa20b87..0000000000000000000000000000000000000000
--- a/import/index_scripts/getdate.bsh
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * Custom date script.
- *
- * This can be used to override built-in SolrMarc custom functions. If you change
- * this script, you will need to activate it in import/marc_local.properties before
- * it will be applied during indexing.
- */
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.apache.log4j.Logger;
-import org.marc4j.marc.*;
-import org.solrmarc.marc.MarcImporter;
-import org.solrmarc.tools.Utils;
-import org.solrmarc.index.SolrIndexer;
-
-private static final Pattern FOUR_DIGIT_PATTERN_BRACES = Pattern.compile("\\[[12]\\d{3,3}\\]");
-private static final Pattern FOUR_DIGIT_PATTERN_ONE_BRACE = Pattern.compile("\\[[12]\\d{3,3}");
-private static final Pattern FOUR_DIGIT_PATTERN_STARTING_WITH_1_2 = Pattern.compile("(20|19|18|17|16|15)[0-9][0-9]");
-private static final Pattern FOUR_DIGIT_PATTERN_OTHER_1 = Pattern.compile("l\\d{3,3}");
-private static final Pattern FOUR_DIGIT_PATTERN_OTHER_2 = Pattern.compile("\\[19\\]\\d{2,2}");
-private static final Pattern FOUR_DIGIT_PATTERN_OTHER_3 = Pattern.compile("(20|19|18|17|16|15)[0-9][-?0-9]");
-private static final Pattern FOUR_DIGIT_PATTERN_OTHER_4 = Pattern.compile("i.e. (20|19|18|17|16|15)[0-9][0-9]");
-private static final Pattern BC_DATE_PATTERN = Pattern.compile("[0-9]+ [Bb][.]?[Cc][.]?");
-private static final Pattern FOUR_DIGIT_PATTERN = Pattern.compile("\\d{4,4}");
-private static Matcher matcher;
-private static Matcher matcher_braces;
-private static Matcher matcher_one_brace;
-private static Matcher matcher_start_with_1_2;
-private static Matcher matcher_l_plus_three_digits;
-private static Matcher matcher_bracket_19_plus_two_digits;
-private static Matcher matcher_ie_date;
-private static Matcher matcher_bc_date;
-private static Matcher matcher_three_digits_plus_unk;
-protected static Logger logger = Logger.getLogger(Utils.class.getName());
-
-/**
- * Cleans non-digits from a String
- * @param date String to parse
- * @return Numeric part of date String (or null)
- */
-public String cleanDate(String date)
-{
-    matcher_braces = FOUR_DIGIT_PATTERN_BRACES.matcher(date);
-    matcher_one_brace = FOUR_DIGIT_PATTERN_ONE_BRACE.matcher(date);
-    matcher_start_with_1_2 = FOUR_DIGIT_PATTERN_STARTING_WITH_1_2.matcher(date);
-    matcher_l_plus_three_digits = FOUR_DIGIT_PATTERN_OTHER_1.matcher(date);
-    matcher_bracket_19_plus_two_digits = FOUR_DIGIT_PATTERN_OTHER_2.matcher(date);
-    matcher_three_digits_plus_unk = FOUR_DIGIT_PATTERN_OTHER_3.matcher(date);
-    matcher_ie_date = FOUR_DIGIT_PATTERN_OTHER_4.matcher(date);
-    matcher = FOUR_DIGIT_PATTERN.matcher(date);
-    matcher_bc_date = BC_DATE_PATTERN.matcher(date);
-
-    // Default to null so that unmatchable input yields no date:
-    String cleanDate = null;
-
-    if (matcher_braces.find())
-    {
-        cleanDate = matcher_braces.group();
-        cleanDate = Utils.removeOuterBrackets(cleanDate);
-    }
-    else if (matcher_ie_date.find())
-    {
-        cleanDate = matcher_ie_date.group().replaceAll("i.e. ", "");
-    }
-    else if (matcher_one_brace.find())
-    {
-        cleanDate = matcher_one_brace.group();
-        cleanDate = Utils.removeOuterBrackets(cleanDate);
-    }
-    else if (matcher_bc_date.find())
-    {
-        // B.C. dates cannot be represented; return no date at all:
-        cleanDate = null;
-    }
-    else if (matcher_start_with_1_2.find())
-    {
-        cleanDate = matcher_start_with_1_2.group();
-    }
-    else if (matcher_l_plus_three_digits.find())
-    {
-        cleanDate = matcher_l_plus_three_digits.group().replaceAll("l", "1");
-    }
-    else if (matcher_bracket_19_plus_two_digits.find())
-    {
-        cleanDate = matcher_bracket_19_plus_two_digits.group().replaceAll("\\[", "").replaceAll("\\]", "");
-    }
-    else if (matcher_three_digits_plus_unk.find())
-    {
-        cleanDate = matcher_three_digits_plus_unk.group().replaceAll("[-?]", "0");
-    }
-    if (cleanDate != null)
-    {
-        logger.debug("Date : " + date + " mapped to : " + cleanDate);
-    }
-    else
-    {
-        logger.debug("No Date match: " + date);
-    }
-    return cleanDate;
-}
-
-/**
- * Return the date in 260c as a string
- * @param record - the marc record object
- * @return 260c, "cleaned" per cleanDate() above
- */
-public String getDate(Record record)
-{
-    Set result = SolrIndexer.getFieldList(record, "260c");
-    String date = org.solrmarc.tools.Utils.join(result, ", ");
-    if (date == null || date.length() == 0)
-        return (null);
-    return cleanDate(date);
-}
-
-/**
- * Get all available dates from the record.
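- * 260c dates are always included; for RDA-style 264c dates, publication dates
- * (second indicator 1) are preferred and copyright dates (second indicator 4)
- * are only used when no publication dates are present.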
- *
- * @param Record record
- * @return Set dates
- */
-public Set getDates(Record record) {
-    Set dates = new LinkedHashSet();
-
-    // First check old-style 260c date:
-    List list260 = record.getVariableFields("260");
-    for (VariableField vf : list260) {
-        DataField df = (DataField) vf;
-        List currentDates = df.getSubfields('c');
-        for (Subfield sf : currentDates) {
-            String currentDateStr = cleanDate(sf.getData());
-            // cleanDate() returns null for unparseable values; skip those so
-            // the set only ever holds usable dates:
-            if (currentDateStr != null) {
-                dates.add(currentDateStr);
-            }
-        }
-    }
-
-    // Now track down relevant RDA-style 264c dates; we only care about
-    // copyright and publication dates (and ignore copyright dates if
-    // publication dates are present).
-    Set pubDates = new LinkedHashSet();
-    Set copyDates = new LinkedHashSet();
-    List list264 = record.getVariableFields("264");
-    for (VariableField vf : list264) {
-        DataField df = (DataField) vf;
-        List currentDates = df.getSubfields('c');
-        for (Subfield sf : currentDates) {
-            String currentDateStr = cleanDate(sf.getData());
-            if (currentDateStr == null) {
-                continue;
-            }
-            char ind2 = df.getIndicator2();
-            switch (ind2)
-            {
-                case '1':
-                    pubDates.add(currentDateStr);
-                    break;
-                case '4':
-                    copyDates.add(currentDateStr);
-                    break;
-            }
-        }
-    }
-    if (pubDates.size() > 0) {
-        dates.addAll(pubDates);
-    } else if (copyDates.size() > 0) {
-        dates.addAll(copyDates);
-    }
-
-    return dates;
-}
-
-/**
- * Get the earliest publication date from the record.
- *
- * @param Record record
- * @return String earliest date
- */
-public String getFirstDate(Record record) {
-    String result = null;
-    Set dates = getDates(record);
-    Iterator datesIter = dates.iterator();
-    while (datesIter.hasNext()) {
-        String current = datesIter.next();
-        if (result == null || Integer.parseInt(current) < Integer.parseInt(result)) {
-            result = current;
-        }
-    }
-    return result;
-}
\ No newline at end of file
diff --git a/import/index_scripts/getpublishers.bsh b/import/index_scripts/getpublishers.bsh
deleted file mode 100644
index 465909590cc0c19f044c0486796eb47c3f0fa1eb..0000000000000000000000000000000000000000
--- a/import/index_scripts/getpublishers.bsh
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Custom publisher script.
- *
- * This can be used to override built-in SolrMarc custom functions. If you change
- * this script, you will need to activate it in import/marc_local.properties before
- * it will be applied during indexing.
- */
-import org.marc4j.marc.*;
-
-/**
- * Get all available publishers from the record.
- *
- * @param Record record
- * @return Set publishers
- */
-public Set getPublishers(Record record) {
-    Set publishers = new LinkedHashSet();
-
-    // First check old-style 260b name:
-    List list260 = record.getVariableFields("260");
-    for (VariableField vf : list260)
-    {
-        DataField df = (DataField) vf;
-        String currentString = "";
-        for (Subfield current : df.getSubfields('b')) {
-            currentString = currentString.trim().concat(" " + current.getData()).trim();
-        }
-        if (currentString.length() > 0) {
-            publishers.add(currentString);
-        }
-    }
-
-    // Now track down relevant RDA-style 264b names; we only care about
-    // copyright and publication names (and ignore copyright names if
-    // publication names are present).
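-    // MARC 264 second indicator distinguishes the function of the name:
-    // 1 = publication statement, 4 = copyright notice.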
- Set pubNames = new LinkedHashSet(); - Set copyNames = new LinkedHashSet(); - List list264 = record.getVariableFields("264"); - for (VariableField vf : list264) - { - DataField df = (DataField) vf; - String currentString = ""; - for (Subfield current : df.getSubfields('b')) { - currentString = currentString.trim().concat(" " + current.getData()).trim(); - } - if (currentString.length() > 0) { - char ind2 = df.getIndicator2(); - switch (ind2) - { - case '1': - pubNames.add(currentString); - break; - case '4': - copyNames.add(currentString); - break; - } - } - } - if (pubNames.size() > 0) { - publishers.addAll(pubNames); - } else if (copyNames.size() > 0) { - publishers.addAll(copyNames); - } - - return publishers; -} \ No newline at end of file diff --git a/import/index_scripts/illustrated.bsh b/import/index_scripts/illustrated.bsh deleted file mode 100644 index 3733630e7a4c00e9f40e51703b02931e183fcaa3..0000000000000000000000000000000000000000 --- a/import/index_scripts/illustrated.bsh +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Custom isIllustrated() script. - * - * This can be used to override built-in SolrMarc custom functions. If you change - * this script, you will need to activate it in import/marc_local.properties before - * it will be applied during indexing. - */ -import org.marc4j.marc.Record; -import org.marc4j.marc.ControlField; -import org.marc4j.marc.DataField; - -/** - * Determine if a record is illustrated. - * - * @param Record record - * @return String "Illustrated" or "Not Illustrated" - */ -public String isIllustrated(Record record) { - String leader = record.getLeader().toString(); - - // Does the leader indicate this is a "language material" that might have extra - // illustration details in the fixed fields? - if (leader.charAt(6) == 'a') { - String currentCode = ""; // for use in loops below - - // List of 008/18-21 codes that indicate illustrations: - String illusCodes = "abcdefghijklmop"; - - // Check the illustration characters of the 008: - ControlField fixedField = (ControlField) record.getVariableField("008"); - if (fixedField != null) { - String fixedFieldText = fixedField.getData().toLowerCase(); - for (int i = 18; i <= 21; i++) { - if (i < fixedFieldText.length()) { - currentCode = fixedFieldText.substring(i, i + 1); - if (illusCodes.contains(currentCode)) { - return "Illustrated"; - } - } - } - } - - // Now check if any 006 fields apply: - List fields = record.getVariableFields("006"); - Iterator fieldsIter = fields.iterator(); - if (fields != null) { - ControlField formatField; - while(fieldsIter.hasNext()) { - fixedField = (ControlField) fieldsIter.next(); - String fixedFieldText = fixedField.getData().toLowerCase(); - for (int i = 1; i <= 4; i++) { - if (i < fixedFieldText.length()) { - currentCode = fixedFieldText.substring(i, i + 1); - if (illusCodes.contains(currentCode)) { - return "Illustrated"; - } - } - } - } - } - } - - // Now check for interesting strings in 300 subfield b: - List fields = record.getVariableFields("300"); - Iterator fieldsIter = fields.iterator(); - if (fields != null) { - DataField physical; - while(fieldsIter.hasNext()) { - physical = (DataField) fieldsIter.next(); - List subfields = physical.getSubfields('b'); - Iterator subfieldsIter = subfields.iterator(); - if (subfields != null) { - String desc; - while (subfieldsIter.hasNext()) { - desc = subfieldsIter.next().getData().toLowerCase(); - if (desc.contains("ill.") || desc.contains("illus.")) { - return "Illustrated"; - } - } - } - } - } - - // If we made it this far, we found 
no sign of illustrations: - return "Not Illustrated"; -} diff --git a/import/index_scripts/location.bsh b/import/index_scripts/location.bsh deleted file mode 100644 index 9787855982e204f06e26ba59b8c63746b5b1e757..0000000000000000000000000000000000000000 --- a/import/index_scripts/location.bsh +++ /dev/null @@ -1,271 +0,0 @@ -/** - * Custom script to get latitude and longitude coordinates. - * Records can have multiple coordinates sets - * of points and/or rectangles. - * Points are represented by coordinate sets where N=S E=W. - * - * code adapted from xrosecky - Moravian Library - * https://github.com/moravianlibrary/VuFind-2.x/blob/master/import/index_scripts/geo.bsh - * and incorporates VuFind location.bsh functionality for GoogleMap display. - * - */ - -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import org.marc4j.marc.*; - -private static final Pattern COORDINATES_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d{3})(\\d{2})(\\d{2})"); -private static final Pattern HDMSHDD_PATTERN = Pattern.compile("^([eEwWnNsS])(\\d+(\\.\\d+)?)"); -private static final Pattern PMDD_PATTERN = Pattern.compile("^([+-])(\\d+(\\.\\d+)?)"); - -/** - * Convert MARC coordinates into location_geo format. - * - * @param Record record - * @return List geo_coordinates - */ -public List getAllCoordinates(Record record) { - List geo_coordinates = new ArrayList(); - List list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String d = df.getSubfield('d').getData(); - String e = df.getSubfield('e').getData(); - String f = df.getSubfield('f').getData(); - String g = df.getSubfield('g').getData(); - //System.out.println("raw Coords: "+d+" "+e+" "+f+" "+g); - - // Check to see if there are only 2 coordinates - // If so, copy them into the corresponding coordinate fields - if ((d !=null && (e == null || e.trim().equals(""))) && (f != null && (g==null || g.trim().equals("")))) { - e = d; - g = f; - } - if ((e !=null && (d == null || d.trim().equals(""))) && (g != null && (f==null || f.trim().equals("")))) { - d = e; - f = g; - } - - // Check and convert coordinates to +/- decimal degrees - Double west = convertCoordinate(d); - Double east = convertCoordinate(e); - Double north = convertCoordinate(f); - Double south = convertCoordinate(g); - - // New Format for indexing coordinates in Solr 5.0 - minX, maxX, maxY, minY - // Note - storage in Solr follows the WENS order, but display is WSEN order - String result = String.format("ENVELOPE(%s,%s,%s,%s)", new Object[] { west, east, north, south }); - - if (validateCoordinates(west, east, north, south)) { - geo_coordinates.add(result); - } - } - } - return geo_coordinates; -} - -/** - * Get point coordinates for GoogleMap display. 
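- * Points are encoded in 034 as degenerate rectangles (N=S and E=W), so each
- * matching field yields a single "longitude,latitude" string.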
- * - * @param Record record - * @return List coordinates - */ -public List getPointCoordinates(Record record) { - List coordinates = new ArrayList(); - List list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String d = df.getSubfield('d').getData(); - String e = df.getSubfield('e').getData(); - String f = df.getSubfield('f').getData(); - String g = df.getSubfield('g').getData(); - - // Check to see if there are only 2 coordinates - if ((d !=null && (e == null || e.trim().equals(""))) && (f != null && (g==null || g.trim().equals("")))) { - Double long_val = convertCoordinate(d); - Double lat_val = convertCoordinate(f); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - if ((e !=null && (d == null || d.trim().equals(""))) && (g != null && (f==null || f.trim().equals("")))) { - Double long_val = convertCoordinate(e); - Double lat_val = convertCoordinate(g); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - // Check if N=S and E=W - if (d.equals(e) && f.equals(g)) { - Double long_val = convertCoordinate(d); - Double lat_val = convertCoordinate(f); - String longlatCoordinate = Double.toString(long_val) + ',' + Double.toString(lat_val); - coordinates.add(longlatCoordinate); - } - } - } - return coordinates; -} - -/** - * Get all available coordinates from the record. - * - * @param Record record - * @return List geo_coordinates - */ -public List getDisplayCoordinates(Record record) { - List geo_coordinates = new ArrayList(); - List list034 = record.getVariableFields("034"); - if (list034 != null) { - for (VariableField vf : list034) { - DataField df = (DataField) vf; - String west = df.getSubfield('d').getData(); - String east = df.getSubfield('e').getData(); - String north = df.getSubfield('f').getData(); - String south = df.getSubfield('g').getData(); - String result = String.format("%s %s %s %s", new Object[] { west, east, north, south }); - if (west != null || east != null || north != null || south != null) { - geo_coordinates.add(result); - } - } - } - return geo_coordinates; -} - -/** - * Check coordinate type HDMS HDD or +/-DD. 
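- * Examples: "W0873000" (hemisphere plus DDDMMSS) converts to -87.5, and
- * "+087.25" converts to 87.25.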
- * - * @param String coordinateStr - * @return Double coordinate - */ -public Double convertCoordinate(String coordinateStr) { - Double coordinate = Double.NaN; - Matcher HDmatcher = HDMSHDD_PATTERN.matcher(coordinateStr); - Matcher PMDmatcher = PMDD_PATTERN.matcher(coordinateStr); - if (HDmatcher.matches()) { - String hemisphere = HDmatcher.group(1).toUpperCase(); - Double degrees = Double.parseDouble(HDmatcher.group(2)); - // Check for HDD or HDMS - if (hemisphere.equals("N") || hemisphere.equals("S")) { - if (degrees > 90) { - String hdmsCoordinate = hemisphere+"0"+HDmatcher.group(2); - coordinate = coordinateToDecimal(hdmsCoordinate); - } else { - coordinate = Double.parseDouble(HDmatcher.group(2)); - if (hemisphere.equals("S")) { - coordinate *= -1; - } - } - } - if (hemisphere.equals("E") || hemisphere.equals("W")) { - if (degrees > 180) { - String hdmsCoordinate = HDmatcher.group(0); - coordinate = coordinateToDecimal(hdmsCoordinate); - } else { - coordinate = Double.parseDouble(HDmatcher.group(2)); - if (hemisphere.equals("W")) { - coordinate *= -1; - } - } - } - return coordinate; - } else if (PMDmatcher.matches()) { - String hemisphere = PMDmatcher.group(1); - coordinate = Double.parseDouble(PMDmatcher.group(2)); - if (hemisphere.equals("-")) { - coordinate *= -1; - } - return coordinate; - } else { - return null; - } -} - -/** - * Convert HDMS coordinates to decimal degrees. - * - * @param String coordinateStr - * @return Double coordinate - */ -public Double coordinateToDecimal(String coordinateStr) { - Matcher matcher = COORDINATES_PATTERN.matcher(coordinateStr); - if (matcher.matches()) { - String hemisphere = matcher.group(1).toUpperCase(); - int degrees = Integer.parseInt(matcher.group(2)); - int minutes = Integer.parseInt(matcher.group(3)); - int seconds = Integer.parseInt(matcher.group(4)); - double coordinate = degrees + (minutes / 60.0) + (seconds / 3600.0); - if (hemisphere.equals("W") || hemisphere.equals("S")) { - coordinate *= -1; - } - return coordinate; - } - return null; -} - -/** - * Check decimal degree coordinates to make sure they are valid. - * - * @param Double west, east, north, south - * @return boolean - */ -public boolean validateCoordinates(Double west, Double east, Double north, Double south) { - if (west == null || east == null || north == null || south == null) { - return false; - } - if (west > 180.0 || west < -180.0 || east > 180.0 || east < -180.0) { - return false; - } - if (north > 90.0 || north < -90.0 || south > 90.0 || south < -90.0) { - return false; - } - if (north < south || west > east) { - return false; - } - return true; -} - -/** - * THIS FUNCTION HAS BEEN DEPRECATED. - * Determine the longitude and latitude of the items location. 
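- * Use getPointCoordinates() or getDisplayCoordinates() above in new indexing
- * configurations instead.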
- *
- * @param Record record
- * @return String "longitude, latitude"
- */
-public String getLongLat(Record record) {
-    // Check 034 subfield d and f
-    List fields = record.getVariableFields("034");
-    if (fields != null) {
-        Iterator fieldsIter = fields.iterator();
-        DataField physical;
-        while (fieldsIter.hasNext()) {
-            physical = (DataField) fieldsIter.next();
-            // Initialize so the value is defined even when subfield d is missing:
-            String val = null;
-
-            List subfields_d = physical.getSubfields('d');
-            if (subfields_d != null) {
-                Iterator subfieldsIter_d = subfields_d.iterator();
-                while (subfieldsIter_d.hasNext()) {
-                    val = subfieldsIter_d.next().getData().trim();
-                    if (!val.matches("-?\\d+(.\\d+)?")) {
-                        return null;
-                    }
-                }
-            }
-            List subfields_f = physical.getSubfields('f');
-            if (subfields_f != null) {
-                Iterator subfieldsIter_f = subfields_f.iterator();
-                while (subfieldsIter_f.hasNext()) {
-                    String val2 = subfieldsIter_f.next().getData().trim();
-                    if (!val2.matches("-?\\d+(.\\d+)?")) {
-                        return null;
-                    }
-                    val = val + ',' + val2;
-                }
-            }
-            return val;
-        }
-    }
-    // otherwise return null
-    return null;
-}
\ No newline at end of file
diff --git a/import/index_scripts/readingprograms.bsh b/import/index_scripts/readingprograms.bsh
deleted file mode 100644
index 053fb299609d46965c29541cfcffbc89172a3677..0000000000000000000000000000000000000000
--- a/import/index_scripts/readingprograms.bsh
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Reading program scripts courtesy of Chanel Wheeler
- *
- * Example usage:
- *
- * #### In marc_local.properties, insert this:
- * arLevel = script(readingprograms.bsh), getARLevel, (pattern_map.level)
- * rcLevel = script(readingprograms.bsh), getRCLevel, (pattern_map.level)
- * pattern_map.level.pattern_0 = ([0-9]\\.[0-9]).*=>$1
- *
- * #### In solr/biblio/conf/schema.xml (I'm not aware of any way to localize this),
- * #### add this in the <types> section:
- * <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
- *
- * #### In solr/biblio/conf/schema.xml, add this in the <fields> section
- * <field name="arLevel" type="tfloat" indexed="true" stored="true"/>
- * <field name="rcLevel" type="tfloat" indexed="true" stored="true"/>
- */
-import org.marc4j.marc.Record;
-import org.marc4j.marc.DataField;
-
-org.solrmarc.index.SolrIndexer indexer = null;
-
-/**
- * Get reading level for Accelerated Reader items
- *
- * @param Record record
- * @return String AR level
- */
- public String getARLevel(Record record) {
-     List readingprograms = record.getVariableFields("526");
-     if (readingprograms != null) {
-         Iterator rpIter = readingprograms.iterator();
-         while (rpIter.hasNext()) {
-             DataField rp = (DataField) rpIter.next();
-             if (rp.getSubfield('a') != null) {
-                 if (rp.getSubfield('a').getData().toLowerCase().contains("accelerated reader")) {
-                     return rp.getSubfield('c').getData();
-                 }
-             }
-         }
-     }
-     return null;
- }
-
- /**
- * Get reading level for Reading Counts items
- *
- * @param Record record
- * @return String RC level
- */
- public String getRCLevel(Record record) {
-     List readingprograms = record.getVariableFields("526");
-     if (readingprograms != null) {
-         Iterator rpIter = readingprograms.iterator();
-         while (rpIter.hasNext()) {
-             DataField rp = (DataField) rpIter.next();
-             if (rp.getSubfield('a') != null) {
-                 if (rp.getSubfield('a').getData().toLowerCase().contains("reading counts")) {
-                     return rp.getSubfield('c').getData();
-                 }
-             }
-         }
-     }
-     return null;
-}
\ No newline at end of file
diff --git a/import/marc_auth.properties b/import/marc_auth.properties
index
diff --git a/import/marc_auth.properties b/import/marc_auth.properties
index 8bb043d9967e99e46df50ee6252854b4aa58f57f..5ef540dee66792ca334980924be27811c04929c6 100644
--- a/import/marc_auth.properties
+++ b/import/marc_auth.properties
@@ -3,8 +3,8 @@
 # marc.properties for the more commonly changed                              #
 # bibliographic settings.                                                    #
 ###############################################################################
-id = script(getFirstNormalizedLCCN.bsh), getFirstNormalizedLCCN("010a")
-lccn = script(getNormalizedLCCNs.bsh), getNormalizedLCCNs("010a")
+id = custom, getFirstNormalizedLCCN("010a")
+lccn = custom, getNormalizedLCCNs("010a")
 
 # These values should be overridden in a second properties file (for example,
 # see marc_auth_fast_*.properties). This allows the basic authority mappings to
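[Editorial note: both the removed script() syntax and the new custom syntax above invoke LCCN normalization on 010 $a. The sketch below illustrates the standard Library of Congress normalization rules such routines are meant to apply; it is an independent illustration under that assumption, not SolrMarc's actual implementation.]

public class LccnSketch {
    static String normalizeLccn(String raw) {
        String lccn = raw.replaceAll("\\s", "");    // 1. strip all blanks
        int slash = lccn.indexOf('/');
        if (slash >= 0) {
            lccn = lccn.substring(0, slash);        // 2. drop '/' and everything after
        }
        int hyphen = lccn.indexOf('-');
        if (hyphen >= 0) {
            String serial = lccn.substring(hyphen + 1);
            if (serial.length() > 6) {
                return null;                        // serial portion too long: invalid
            }
            // 3. remove the hyphen, zero-padding the serial portion to six digits
            lccn = lccn.substring(0, hyphen)
                 + "000000".substring(serial.length()) + serial;
        }
        return lccn;
    }

    public static void main(String[] args) {
        System.out.println(normalizeLccn("n78-890351"));     // n78890351
        System.out.println(normalizeLccn("85-2"));           // 85000002
        System.out.println(normalizeLccn("75-425165//r75")); // 75425165
    }
}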
diff --git a/import/marc_local.properties b/import/marc_local.properties
index 34a30a325fce0f12acf8c7c11b89eeb248c03d27..dcfbe82eb866a56a6d419f92305b9b5b6e7c3a12 100644
--- a/import/marc_local.properties
+++ b/import/marc_local.properties
@@ -9,33 +9,6 @@
 #institution = "MyInstitution"
 #building = "Library A"
 
-# Uncomment the following settings to use the .bsh scripts in import/scripts/
-# instead of the built-in SolrMarc functionality found in the .jar file.
-# (by default, the scripts have the same behavior as the built-in functions,
-# but the external scripts are easier to customize to your needs).
-#format = script(format.bsh), getFormat, format_map.properties
-#author = script(author.bsh), getAuthorsFilteredByRelator(100abcd:700abcd,100,firstAuthorRoles)
-#author_variant = script(author.bsh), getAuthorInitialsFilteredByRelator(100a:700a,100,firstAuthorRoles)
-#author_fuller = script(author.bsh), getAuthorsFilteredByRelator(100q:700q,100,firstAuthorRoles)
-#author_role = script(author.bsh), getRelatorsFilteredByRelator(100abcd:700abcd,100,firstAuthorRoles)
-#author2 = script(author.bsh), getAuthorsFilteredByRelator(700abcd,700,secondAuthorRoles)
-#author2_variant = script(author.bsh), getAuthorInitialsFilteredByRelator(700a,700,secondAuthorRoles)
-#author2_fuller = script(author.bsh), getAuthorsFilteredByRelator(700q,700,secondAuthorRoles)
-#author2_role = script(author.bsh), getRelatorsFilteredByRelator(700abcd,700,secondAuthorRoles)
-#author_corporate = script(author.bsh), getAuthorsFilteredByRelator(110ab:111abc:710ab:711ab,110:111:710:711,firstAuthorRoles|secondAuthorRoles)
-#author_corporate_role = script(author.bsh), getRelatorsFilteredByRelator(110ab:111abc:710ab:711ab,110:111:710:711,firstAuthorRoles|secondAuthorRoles)
-#author_sort = script(author.bsh), getFirstAuthorFilteredByRelator(100abcd:110ab:111abc:700abcd,100:110:111:700,firstAuthorRoles)
-#callnumber-subject = script(callnumber.bsh), getCallNumberSubject(090a:050a), callnumber_subject_map.properties
-#callnumber-label = script(callnumber.bsh), getCallNumberLabel(090a:050a)
-#publisher = script(getpublishers.bsh), getPublishers
-#publishDate = script(getdate.bsh), getDates
-#illustrated = script(illustrated.bsh), isIllustrated
-#dewey-hundreds = script(dewey.bsh), getDeweyNumber(082a:083a, 100), ddc22_map.properties(hundreds)
-#dewey-tens = script(dewey.bsh), getDeweyNumber(082a:083a, 10), ddc22_map.properties(tens)
-#dewey-ones = script(dewey.bsh), getDeweyNumber(082a:083a, 1), ddc22_map.properties(ones)
-#dewey-full = script(dewey.bsh), getDeweySearchable(082a:083a)
-#dewey-sort = script(dewey.bsh), getDeweySortable(082a:083a)
-
 # Uncomment the following lines to track history of indexing times for RSS feeds,
 # OAI-PMH server and other updates. The parameter to these functions must be the
 # same fieldspec as the id setting above!
@@ -63,7 +36,6 @@
 #long_lat_display = custom, getDisplayCoordinates
 #long_lat_label = 034z
-
 # Uncomment the following lines if you are indexing journal article data that uses
 # the 773 field to describe the journal containing the article. These settings
 # assume the records have been formatted according to recommendation 4.1 found at: