Skip to content
Snippets Groups Projects
Commit 58995256 authored by Demian Katz
Browse files

Resolving VUFIND-693 (Upgrade to SolrMarc 2.5).

parent 2fe79719
No related merge requests found
No preview for this file type
No preview for this file type
...@@ -9,7 +9,6 @@ import org.marc4j.marc.Record; ...@@ -9,7 +9,6 @@ import org.marc4j.marc.Record;
import org.marc4j.marc.DataField; import org.marc4j.marc.DataField;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.io.*; import java.io.*;
import org.ini4j.Ini;
import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document; import org.w3c.dom.Document;
...@@ -17,6 +16,53 @@ import org.w3c.dom.Element; ...@@ -17,6 +16,53 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node; import org.w3c.dom.Node;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
// Handle to the base-level SolrMarc indexer, declared here so that its methods
// (e.g. getConfigSetting and getLogger) can be called from the script functions below.
// It starts out null; the SolrIndexer code sets this value before any of the
// script methods are called.
org.solrmarc.index.SolrIndexer indexer = null;
/**
 * Load configurations for the full text parser. Return an array containing the
 * parser type in the first element and the parser configuration in the second
 * element.
 *
 * The parser is selected from fulltext.ini. When [General] parser is set, only
 * that parser is considered (the value is compared case-insensitively and
 * independently of the JVM locale). When it is not set, Aperture is preferred
 * over Tika, based on which of them has a path configured. A parser is only
 * reported as active when its path setting is present, so the second element
 * is never null for an active parser.
 *
 * @return String[] { "aperture", path }, { "tika", path } or { "none", null }
 */
public String[] getFulltextParserSettings()
{
    String parserType = indexer.getConfigSetting(
        "fulltext.ini", "General", "parser"
    );
    boolean autoDetect = (null == parserType);

    // Is Aperture active?
    boolean apertureRequested = "aperture".equalsIgnoreCase(parserType);
    String aperturePath = indexer.getConfigSetting(
        "fulltext.ini", "Aperture", "webcrawler"
    );
    if (null != aperturePath && (autoDetect || apertureRequested)) {
        String[] array = { "aperture", aperturePath };
        return array;
    }

    // Is Tika active?
    boolean tikaRequested = "tika".equalsIgnoreCase(parserType);
    String tikaPath = indexer.getConfigSetting(
        "fulltext.ini", "Tika", "path"
    );
    if (null != tikaPath && (autoDetect || tikaRequested)) {
        String[] array = { "tika", tikaPath };
        return array;
    }

    // A parser was explicitly requested but its path is missing; running it
    // would mean executing a command built from a null path, so report the
    // misconfiguration and fall back to "none" instead.
    if (apertureRequested || tikaRequested) {
        indexer.getLogger().warn(
            "fulltext.ini selects parser '" + parserType
            + "' but no path is configured for it; full text will not be indexed."
        );
    }

    // No recognized parser found:
    String[] array = { "none", null };
    return array;
}
/** /**
* Extract full-text from the documents referenced in the tags * Extract full-text from the documents referenced in the tags
* *
...@@ -28,9 +74,9 @@ import org.w3c.dom.NodeList; ...@@ -28,9 +74,9 @@ import org.w3c.dom.NodeList;
public String getFulltext(Record record, String fieldSpec, String extension) { public String getFulltext(Record record, String fieldSpec, String extension) {
String result = ""; String result = "";
// Get the path to Aperture web crawler (and return no text if it is unavailable) // Get the web crawler settings (and return no text if it is unavailable)
String aperturePath = getAperturePath(); String[] parserSettings = getFulltextParserSettings();
if (aperturePath == null) { if (parserSettings[0].equals("none")) {
return null; return null;
} }
...@@ -43,8 +89,8 @@ public String getFulltext(Record record, String fieldSpec, String extension) { ...@@ -43,8 +89,8 @@ public String getFulltext(Record record, String fieldSpec, String extension) {
String current = fieldsIter.next(); String current = fieldsIter.next();
// Filter by file extension // Filter by file extension
if (extension == null || current.endsWith(extension)) { if (extension == null || current.endsWith(extension)) {
// Load the aperture output for each tag into a string // Load the parser output for each tag into a string
result = result + harvestWithAperture(current, aperturePath); result = result + harvestWithParser(current, parserSettings);
} }
} }
} }
...@@ -74,57 +120,48 @@ public String getFulltext(Record record) { ...@@ -74,57 +120,48 @@ public String getFulltext(Record record) {
} }
/** /**
* Extract the Aperture path from fulltext.ini * Clean up XML data generated by Aperture
* *
* @return String Path to Aperture executables * @param File The file to clean
* @return File A fixed version of the file
*/ */
public String getAperturePath() { public File sanitizeApertureOutput(File f)
// Obtain path to Aperture from the fulltext.ini file: {
Ini ini = new Ini(); //clean up the aperture xml output
File tempFile = File.createTempFile("buffer", ".tmp");
// Find VuFind's home directory in the environment; if it's not available, FileOutputStream fw = new FileOutputStream(tempFile);
// try using a relative path on the assumption that we are currently in Writer writer = new OutputStreamWriter(fw, "UTF8");
// VuFind's root directory:
String vufindHome = System.getenv("VUFIND_HOME");
if (vufindHome == null) {
vufindHome = "";
}
// TODO: update this to work with 2.0 config paths //delete this control character from the File and save
String fulltextIniFile = vufindHome + "/web/conf/fulltext.ini"; Reader fr = new FileReader(f);
File file = new File(fulltextIniFile); BufferedReader br = new BufferedReader(fr);
try { while (br.ready()) {
ini.load(new FileReader(fulltextIniFile)); writer.write(br.readLine().replaceAll("\u000C",""));
} catch (Throwable e) {
dieWithError("Unable to access " + fulltextIniFile);
}
String aperturePath = ini.get("Aperture", "webcrawler");
if (aperturePath == null) {
return null;
}
// Drop comments if necessary:
int pos = aperturePath.indexOf(';');
if (pos >= 0) {
aperturePath = aperturePath.substring(0, pos).trim();
} }
writer.close();
br.close();
fr.close();
// Strip wrapping quotes if necessary (the ini reader won't do this for us): return tempFile;
if (aperturePath.startsWith("\"")) { }
aperturePath = aperturePath.substring(1, aperturePath.length());
}
if (aperturePath.endsWith("\"")) {
aperturePath = aperturePath.substring(0, aperturePath.length() - 1);
}
return aperturePath; /**
* Clean up bad characters in the full text.
*
* @param String Text to clean
* @return String Cleaned text
*/
public String sanitizeFullText(text)
{
String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+";
return text.replaceAll(badChars, " ");
} }
/** /**
* Harvest the contents of a document file (PDF, Word, etc.) using Aperture. * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
* This method will only work if Aperture is properly configured in the * This method will only work if Aperture is properly configured in the
* web/conf/fulltext.ini file. Without proper configuration, this will * fulltext.ini file. Without proper configuration, this will simply return an
* simply return an empty string. * empty string.
* *
* @param String The url extracted from the MARC tag. * @param String The url extracted from the MARC tag.
* @param String The path to Aperture * @param String The path to Aperture
...@@ -142,30 +179,84 @@ public String harvestWithAperture(url, aperturePath) { ...@@ -142,30 +179,84 @@ public String harvestWithAperture(url, aperturePath) {
String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url; String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url;
// Call Aperture // Call Aperture
System.out.println("Loading fulltext from " + url + ". Please wait ..."); //System.out.println("Loading fulltext from " + url + ". Please wait ...");
Process p = Runtime.getRuntime().exec(cmd); Process p = Runtime.getRuntime().exec(cmd);
BufferedReader stdInput = new BufferedReader(new BufferedReader stdInput = new BufferedReader(new
InputStreamReader(p.getInputStream())); InputStreamReader(p.getInputStream()));
while ((s = stdInput.readLine()) != null) { while ((s = stdInput.readLine()) != null) {
System.out.println(s); //System.out.println(s);
} }
// Wait for Aperture to finish // Wait for Aperture to finish
p.waitFor(); p.waitFor();
// Parse Aperture XML output // Parse Aperture XML output
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); try {
DocumentBuilder db = dbf.newDocumentBuilder(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
Document xmlDoc = db.parse(f); DocumentBuilder db = dbf.newDocumentBuilder();
NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); File tempFile = sanitizeApertureOutput(f);
if(nl != null && nl.getLength() > 0) { Document xmlDoc = db.parse(tempFile);
Node node = nl.item(0); NodeList nl = xmlDoc.getElementsByTagName("plainTextContent");
if (node.getNodeType() == Node.ELEMENT_NODE) { if(nl != null && nl.getLength() > 0) {
plainText = plainText + node.getTextContent(); Node node = nl.item(0);
if (node.getNodeType() == Node.ELEMENT_NODE) {
plainText = plainText + node.getTextContent();
}
} }
}
String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; plainText = sanitizeFullText(plainText);
plainText = plainText.replaceAll(badChars, " ");
// we'll hold onto the temp file if it failed to parse for debugging;
// only set it up to be deleted if we've made it this far successfully.
tempFile.deleteOnExit();
} catch (Exception e) {
indexer.getLogger().error("Error encountered parsing XML Document: " + e);
}
return plainText; return plainText;
} }
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using Tika.
 * This method will only work if Tika is properly configured in the fulltext.ini
 * file. Without proper configuration, this will simply return an empty string.
 *
 * Tika is run as an external "java -jar" process in text mode (-t) with UTF-8
 * output (-eUTF8); everything it writes to standard output is collected and
 * passed through sanitizeFullText before being returned.
 *
 * @param String The url extracted from the MARC tag.
 * @param String The path to Tika
 * @return String The full-text
 */
public String harvestWithTika(url, scraperPath) {
    // Construct the command as an argument array rather than a single string:
    // Runtime.exec(String) splits on whitespace, which would break a jar path
    // containing spaces and would let a URL containing spaces smuggle extra
    // arguments to Tika.
    String[] cmd = { "java", "-jar", scraperPath, "-t", "-eUTF8", url };

    // Call our scraper
    Process p = Runtime.getRuntime().exec(cmd);
    BufferedReader stdInput = new BufferedReader(
        new InputStreamReader(p.getInputStream(), "UTF8")
    );

    // We'll build the string from the command output
    StringBuilder stringBuilder = new StringBuilder();
    try {
        String line;
        boolean firstLine = true;
        while ((line = stdInput.readLine()) != null) {
            // readLine() strips the line terminator; put a newline back between
            // lines so the last word of one line is not fused with the first
            // word of the next in the indexed text.
            if (!firstLine) {
                stringBuilder.append('\n');
            }
            stringBuilder.append(line);
            firstLine = false;
        }
    } finally {
        // Always release the pipe to the child process, even if reading fails.
        stdInput.close();
    }

    return sanitizeFullText(stringBuilder.toString());
}
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using the active parser.
 *
 * Dispatches to harvestWithAperture or harvestWithTika according to the parser
 * type in the first element of the settings array, passing the second element
 * along as the parser path.
 *
 * @param String The url extracted from the MARC tag.
 * @param String[] Configuration settings from getFulltextParserSettings.
 * @return String The full-text (an empty string if no recognized parser is active)
 */
public String harvestWithParser(url, settings) {
    // Tolerate missing or short settings instead of failing on settings[0].
    String parserType = (null == settings || settings.length < 2)
        ? null : settings[0];

    // Constant-first equals() keeps the checks safe when parserType is null.
    if ("aperture".equals(parserType)) {
        return harvestWithAperture(url, settings[1]);
    }
    if ("tika".equals(parserType)) {
        return harvestWithTika(url, settings[1]);
    }

    // Return an empty string rather than null: callers append the result to
    // the accumulated full text, and appending null would insert the literal
    // text "null" into the index.
    return "";
}
\ No newline at end of file
...@@ -23,7 +23,7 @@ log4j.appender.stdout.target=System.out ...@@ -23,7 +23,7 @@ log4j.appender.stdout.target=System.out
# with a max file size of 100KB # with a max file size of 100KB
# and keep 1 previous log file # and keep 1 previous log file
log4j.appender.file=org.apache.log4j.RollingFileAppender log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=solrmarc.log log4j.appender.file.File=${one-jar.home.dir}solrmarc.log
log4j.appender.file.MaxFileSize=100KB log4j.appender.file.MaxFileSize=100KB
log4j.appender.file.MaxBackupIndex=1 log4j.appender.file.MaxBackupIndex=1
log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout=org.apache.log4j.PatternLayout
......
...@@ -83,4 +83,4 @@ oclc_num = 035a, (pattern_map.oclc_num) ...@@ -83,4 +83,4 @@ oclc_num = 035a, (pattern_map.oclc_num)
pattern_map.oclc_num.pattern_0 = \\(OCoLC\\)[^0-9]*[0]*([0-9]+)=>$1 pattern_map.oclc_num.pattern_0 = \\(OCoLC\\)[^0-9]*[0]*([0-9]+)=>$1
pattern_map.oclc_num.pattern_1 = ocm[0]*([0-9]+)[ ]*[0-9]*=>$1 pattern_map.oclc_num.pattern_1 = ocm[0]*([0-9]+)[ ]*[0-9]*=>$1
pattern_map.oclc_num.pattern_2 = ocn[0]*([0-9]+).*=>$1 pattern_map.oclc_num.pattern_2 = ocn[0]*([0-9]+).*=>$1
pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1 pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1
\ No newline at end of file
...@@ -39,8 +39,9 @@ ...@@ -39,8 +39,9 @@
# text, HTML, etc.) The first parameter is a fieldspec showing which fields to use # text, HTML, etc.) The first parameter is a fieldspec showing which fields to use
# for URL retrieval. The second parameter is optional -- if included, only files # for URL retrieval. The second parameter is optional -- if included, only files
# matching the specified suffix will be indexed. Note that this functionality # matching the specified suffix will be indexed. Note that this functionality
# depends on Aperture being installed on your system. See the wiki for details: # depends on a full text tool being installed on your system. See the wiki for
# http://vufind.org/wiki/aperture # details:
# http://vufind.org/wiki/importing_records#indexing_full_text
#fulltext = custom, getFulltext(856u, pdf) #fulltext = custom, getFulltext(856u, pdf)
# Uncomment the following line if you want to index latitude/longitude data for # Uncomment the following line if you want to index latitude/longitude data for
...@@ -62,4 +63,4 @@ ...@@ -62,4 +63,4 @@
#container_issue = 773l #container_issue = 773l
#container_start_page = 773q #container_start_page = 773q
#container_reference = 773g #container_reference = 773g
#container_title = 773s #container_title = 773s
\ No newline at end of file
...@@ -25,12 +25,12 @@ ContentType.Diorama = Physical Object ...@@ -25,12 +25,12 @@ ContentType.Diorama = Physical Object
ContentType.Filmstrip = Visual Materials ContentType.Filmstrip = Visual Materials
ContentType.FlashCard = Visual Materials ContentType.FlashCard = Visual Materials
ContentType.Game = Physical Object ContentType.Game = Physical Object
ContentType.GovernmentDocumentFederal = Government Document|Gov Doc Fed ContentType.GovernmentDocumentFederal = Government Document
ContentType.GovernmentDocumentState = Government Document|Gov Doc State ContentType.GovernmentDocumentState = Government Document
ContentType.GovernmentDocumentStateUniversity = Gov Doc Univ ContentType.GovernmentDocumentStateUniversity = null
ContentType.GovernmentDocumentLocal = Gov Doc Local ContentType.GovernmentDocumentLocal = null
ContentType.GovernmentDocumentInternational = Government Document|Gov Doc Intl ContentType.GovernmentDocumentInternational = Government Document
ContentType.GovernmentDocumentOther = Gov Doc Other ContentType.GovernmentDocumentOther = null
ContentType.Globe = Map|Globe|Physical Object ContentType.Globe = Map|Globe|Physical Object
ContentType.Graphic = Visual Materials ContentType.Graphic = Visual Materials
ContentType.Image = Visual Materials ContentType.Image = Visual Materials
...@@ -51,9 +51,9 @@ ContentType.MotionPicture = Film ...@@ -51,9 +51,9 @@ ContentType.MotionPicture = Film
ContentType.MusicalScore = Musical Score ContentType.MusicalScore = Musical Score
ContentType.MusicalScoreManuscript = Musical Score|Manuscript ContentType.MusicalScoreManuscript = Musical Score|Manuscript
ContentType.MusicRecording = Sound Recording ContentType.MusicRecording = Sound Recording
ContentType.Newspaper = Journal/Magazine ContentType.Newspaper = Journal/Magazine|Newspaper
ContentType.Pamphlet = Visual Materials ContentType.Pamphlet = Visual Materials
ContentType.Periodical = Journal/Magazine ContentType.Periodical = Journal/Magazine|Periodical
ContentType.PhysicalObject = Physical Object ContentType.PhysicalObject = Physical Object
ContentType.Picture = Visual Materials ContentType.Picture = Visual Materials
ContentType.ProjectedMedium = Visual Materials ContentType.ProjectedMedium = Visual Materials
...@@ -76,6 +76,7 @@ ContentType.Website = Online|Computer Resource ...@@ -76,6 +76,7 @@ ContentType.Website = Online|Computer Resource
MediaType.ActivityCard = Visual Materials MediaType.ActivityCard = Visual Materials
MediaType.Atlas = Map|Atlas MediaType.Atlas = Map|Atlas
MediaType.Braille = Braille MediaType.Braille = Braille
MediaType.Broadside = Broadside
MediaType.Chart = Visual Materials MediaType.Chart = Visual Materials
MediaType.Collage = Visual Materials MediaType.Collage = Visual Materials
MediaType.ComputerCard = Computer Media MediaType.ComputerCard = Computer Media
...@@ -154,6 +155,7 @@ MediaType.SoundDisc = null ...@@ -154,6 +155,7 @@ MediaType.SoundDisc = null
MediaType.SoundDiscCD = CD MediaType.SoundDiscCD = CD
MediaType.SoundDiscLP = LP MediaType.SoundDiscLP = LP
MediaType.SoundRecordingOther = Other Media MediaType.SoundRecordingOther = Other Media
MediaType.SoundRecordingOnline = Streaming Audio
MediaType.SoundRoll = Roll MediaType.SoundRoll = Roll
MediaType.SoundTapeReel = Tape Reel MediaType.SoundTapeReel = Tape Reel
MediaType.SoundTrackFilm = Sound Track Film MediaType.SoundTrackFilm = Sound Track Film
...@@ -204,8 +206,8 @@ FormOfItem.Microopaque = Microform ...@@ -204,8 +206,8 @@ FormOfItem.Microopaque = Microform
FormOfItem.PrintLarge = Large Print FormOfItem.PrintLarge = Large Print
FormOfItem.Braille = Braille FormOfItem.Braille = Braille
FormOfItem.Online = Online FormOfItem.Online = Online
FormOfItem.ElectronicDirect = Computer Resource FormOfItem.ElectronicDirect = null
FormOfItem.Electronic = Computer Resource FormOfItem.Electronic = null
FormOfItem.Print = Print FormOfItem.Print = Print
#CombinedType #CombinedType
......
...@@ -76,6 +76,7 @@ ContentType.Website = (Leader[67]=as OR 006[0]=s) AND (008[21]=w OR 006[4]=w) ...@@ -76,6 +76,7 @@ ContentType.Website = (Leader[67]=as OR 006[0]=s) AND (008[21]=w OR 006[4]=w)
MediaType.ActivityCard = 007[01]=ka MediaType.ActivityCard = 007[01]=ka
MediaType.Atlas = 007[01]=ad MediaType.Atlas = 007[01]=ad
MediaType.Braille = 007[01]=fb OR 007[01]=tc MediaType.Braille = 007[01]=fb OR 007[01]=tc
MediaType.Broadside = Heuristic (300 field)
MediaType.Chart = 007[01]=kn MediaType.Chart = 007[01]=kn
MediaType.Collage = 007[01]=kc MediaType.Collage = 007[01]=kc
MediaType.ComputerCard = 007[01]=ck MediaType.ComputerCard = 007[01]=ck
...@@ -156,6 +157,7 @@ MediaType.SoundDisc = 007[01]=sd ...@@ -156,6 +157,7 @@ MediaType.SoundDisc = 007[01]=sd
MediaType.SoundDiscCD = 007[01]=sd AND 007[3]=f MediaType.SoundDiscCD = 007[01]=sd AND 007[3]=f
MediaType.SoundDiscLP = 007[01]=sd AND 007[3]=abde MediaType.SoundDiscLP = 007[01]=sd AND 007[3]=abde
MediaType.SoundRecordingOther = 007[0]=s 007[1]!={cdefgirqstw} MediaType.SoundRecordingOther = 007[0]=s 007[1]!={cdefgirqstw}
MediaType.SoundRecordingOnline = 007[0]=s 007[1]=z AND Online
MediaType.SoundRoll = 007[01]=sq MediaType.SoundRoll = 007[01]=sq
MediaType.SoundTapeReel = 007[01]=st MediaType.SoundTapeReel = 007[01]=st
MediaType.SoundTrackFilm = 007[01]=si MediaType.SoundTrackFilm = 007[01]=si
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment