Skip to content
Snippets Groups Projects
Commit 58995256 authored by Demian Katz's avatar Demian Katz
Browse files

Resolving VUFIND-693 (Upgrade to SolrMarc 2.5).

parent 2fe79719
No related merge requests found
No preview for this file type
No preview for this file type
......@@ -9,7 +9,6 @@ import org.marc4j.marc.Record;
import org.marc4j.marc.DataField;
import java.util.regex.Pattern;
import java.io.*;
import org.ini4j.Ini;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
......@@ -17,6 +16,53 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
// Handle to the base-level SolrMarc indexer, declared here so that its methods
// can be called from the functions in this script.
// It starts out as null; per the original note on this declaration, the
// SolrIndexer code sets this value before the script methods are called.
org.solrmarc.index.SolrIndexer indexer = null;
/**
 * Load configurations for the full text parser. Return an array containing the
 * parser type in the first element and the parser configuration in the second
 * element.
 *
 * The [General] parser setting of fulltext.ini selects the parser explicitly
 * (compared case-insensitively). When that setting is absent, the first parser
 * that has a path configured wins, checking Aperture before Tika. When a parser
 * is selected explicitly, its path is returned as-is, even if it is null.
 *
 * @return String[] Two elements: the parser type ("aperture", "tika" or
 *                  "none") and the configured path (null for "none")
 */
public String[] getFulltextParserSettings()
{
    // The comparisons below use equalsIgnoreCase rather than lower-casing the
    // value first: toLowerCase() depends on the default locale (e.g. "TIKA"
    // becomes "t\u0131ka" under a Turkish locale) and would then fail to match.
    String parserType = indexer.getConfigSetting(
        "fulltext.ini", "General", "parser"
    );

    // Is Aperture active?
    String aperturePath = indexer.getConfigSetting(
        "fulltext.ini", "Aperture", "webcrawler"
    );
    if ((null == parserType && null != aperturePath)
        || "aperture".equalsIgnoreCase(parserType)
    ) {
        String[] array = { "aperture", aperturePath };
        return array;
    }

    // Is Tika active?
    String tikaPath = indexer.getConfigSetting(
        "fulltext.ini", "Tika", "path"
    );
    if ((null == parserType && null != tikaPath)
        || "tika".equalsIgnoreCase(parserType)
    ) {
        String[] array = { "tika", tikaPath };
        return array;
    }

    // No recognized parser found:
    String[] array = { "none", null };
    return array;
}
/**
* Extract full-text from the documents referenced in the tags
*
......@@ -28,9 +74,9 @@ import org.w3c.dom.NodeList;
public String getFulltext(Record record, String fieldSpec, String extension) {
String result = "";
// Get the path to Aperture web crawler (and return no text if it is unavailable)
String aperturePath = getAperturePath();
if (aperturePath == null) {
// Get the web crawler settings (and return no text if it is unavailable)
String[] parserSettings = getFulltextParserSettings();
if (parserSettings[0].equals("none")) {
return null;
}
......@@ -43,8 +89,8 @@ public String getFulltext(Record record, String fieldSpec, String extension) {
String current = fieldsIter.next();
// Filter by file extension
if (extension == null || current.endsWith(extension)) {
// Load the aperture output for each tag into a string
result = result + harvestWithAperture(current, aperturePath);
// Load the parser output for each tag into a string
result = result + harvestWithParser(current, parserSettings);
}
}
}
......@@ -74,57 +120,48 @@ public String getFulltext(Record record) {
}
/**
* Extract the Aperture path from fulltext.ini
* Clean up XML data generated by Aperture
*
* @return String Path to Aperture executables
* @param File The file to clean
* @return File A fixed version of the file
*/
public String getAperturePath() {
// Obtain path to Aperture from the fulltext.ini file:
Ini ini = new Ini();
// Find VuFind's home directory in the environment; if it's not available,
// try using a relative path on the assumption that we are currently in
// VuFind's root directory:
String vufindHome = System.getenv("VUFIND_HOME");
if (vufindHome == null) {
vufindHome = "";
}
public File sanitizeApertureOutput(File f)
{
//clean up the aperture xml output
File tempFile = File.createTempFile("buffer", ".tmp");
FileOutputStream fw = new FileOutputStream(tempFile);
Writer writer = new OutputStreamWriter(fw, "UTF8");
// TODO: update this to work with 2.0 config paths
String fulltextIniFile = vufindHome + "/web/conf/fulltext.ini";
File file = new File(fulltextIniFile);
try {
ini.load(new FileReader(fulltextIniFile));
} catch (Throwable e) {
dieWithError("Unable to access " + fulltextIniFile);
}
String aperturePath = ini.get("Aperture", "webcrawler");
if (aperturePath == null) {
return null;
}
// Drop comments if necessary:
int pos = aperturePath.indexOf(';');
if (pos >= 0) {
aperturePath = aperturePath.substring(0, pos).trim();
//delete this control character from the File and save
Reader fr = new FileReader(f);
BufferedReader br = new BufferedReader(fr);
while (br.ready()) {
writer.write(br.readLine().replaceAll("\u000C",""));
}
writer.close();
br.close();
fr.close();
// Strip wrapping quotes if necessary (the ini reader won't do this for us):
if (aperturePath.startsWith("\"")) {
aperturePath = aperturePath.substring(1, aperturePath.length());
}
if (aperturePath.endsWith("\"")) {
aperturePath = aperturePath.substring(0, aperturePath.length() - 1);
}
return tempFile;
}
return aperturePath;
/**
* Clean up bad characters in the full text.
*
* @param String Text to clean
* @return String Cleaned text
*/
public String sanitizeFullText(text)
{
String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+";
return text.replaceAll(badChars, " ");
}
/**
* Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
* This method will only work if Aperture is properly configured in the
* web/conf/fulltext.ini file. Without proper configuration, this will
* simply return an empty string.
* fulltext.ini file. Without proper configuration, this will simply return an
* empty string.
*
* @param String The url extracted from the MARC tag.
* @param String The path to Aperture
......@@ -142,30 +179,84 @@ public String harvestWithAperture(url, aperturePath) {
String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url;
// Call Aperture
System.out.println("Loading fulltext from " + url + ". Please wait ...");
//System.out.println("Loading fulltext from " + url + ". Please wait ...");
Process p = Runtime.getRuntime().exec(cmd);
BufferedReader stdInput = new BufferedReader(new
InputStreamReader(p.getInputStream()));
while ((s = stdInput.readLine()) != null) {
System.out.println(s);
//System.out.println(s);
}
// Wait for Aperture to finish
p.waitFor();
// Parse Aperture XML output
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
Document xmlDoc = db.parse(f);
NodeList nl = xmlDoc.getElementsByTagName("plainTextContent");
if(nl != null && nl.getLength() > 0) {
Node node = nl.item(0);
if (node.getNodeType() == Node.ELEMENT_NODE) {
plainText = plainText + node.getTextContent();
try {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = dbf.newDocumentBuilder();
File tempFile = sanitizeApertureOutput(f);
Document xmlDoc = db.parse(tempFile);
NodeList nl = xmlDoc.getElementsByTagName("plainTextContent");
if(nl != null && nl.getLength() > 0) {
Node node = nl.item(0);
if (node.getNodeType() == Node.ELEMENT_NODE) {
plainText = plainText + node.getTextContent();
}
}
}
String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+";
plainText = plainText.replaceAll(badChars, " ");
plainText = sanitizeFullText(plainText);
// we'll hold onto the temp file if it failed to parse for debugging;
// only set it up to be deleted if we've made it this far successfully.
tempFile.deleteOnExit();
} catch (Exception e) {
indexer.getLogger().error("Error encountered parsing XML Document: " + e);
}
return plainText;
}
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using Tika.
 * This method will only work if Tika is properly configured in the fulltext.ini
 * file.
 *
 * Tika is run as "java -jar <path> -t -eUTF8 <url>" and everything it writes to
 * standard output is collected, one line at a time, and then passed through
 * sanitizeFullText().
 *
 * @param String The url extracted from the MARC tag.
 * @param String The path to Tika
 * @return String The full-text
 */
public String harvestWithTika(url, scraperPath) {
    // Pass the command as an argument array rather than one string:
    // Runtime.exec(String) splits on whitespace, so a jar path or URL that
    // contains a space would be broken into several arguments.
    String[] cmd = { "java", "-jar", scraperPath, "-t", "-eUTF8", url };

    // Call our scraper
    Process p = Runtime.getRuntime().exec(cmd);
    BufferedReader stdInput = new BufferedReader(
        new InputStreamReader(p.getInputStream(), "UTF8")
    );

    // We'll build the string from the command output
    StringBuilder stringBuilder = new StringBuilder();
    try {
        String line;
        while ((line = stdInput.readLine()) != null) {
            // readLine() strips the line terminator; put one back so the last
            // word of a line is not fused with the first word of the next.
            stringBuilder.append(line).append('\n');
        }
    } finally {
        // Release the pipe and the child process even if reading failed.
        // Standard output has reached end-of-file (or reading was abandoned)
        // by this point, so no further output is expected from the child.
        stdInput.close();
        p.destroy();
    }

    return sanitizeFullText(stringBuilder.toString());
}
/**
 * Harvest the contents of a document file (PDF, Word, etc.) using the active parser.
 *
 * The first element of the settings array selects the parser ("aperture" or
 * "tika") and the second element is handed to that parser as its path. Any
 * other parser type yields an empty string, so callers can safely concatenate
 * the result.
 *
 * @param String The url extracted from the MARC tag.
 * @param String[] Configuration settings from getFulltextParserSettings.
 * @return String The full-text (empty if no recognized parser is selected)
 */
public String harvestWithParser(url, settings) {
    // Constant-first comparisons stay safe if the parser type is null.
    if ("aperture".equals(settings[0])) {
        return harvestWithAperture(url, settings[1]);
    }
    if ("tika".equals(settings[0])) {
        return harvestWithTika(url, settings[1]);
    }
    // Return "" rather than null: getFulltext() appends this value to a
    // String, and appending null would add the literal text "null".
    return "";
}
\ No newline at end of file
......@@ -23,7 +23,7 @@ log4j.appender.stdout.target=System.out
# with a max file size of 100KB
# and keep 1 previous log file
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=solrmarc.log
log4j.appender.file.File=${one-jar.home.dir}solrmarc.log
log4j.appender.file.MaxFileSize=100KB
log4j.appender.file.MaxBackupIndex=1
log4j.appender.file.layout=org.apache.log4j.PatternLayout
......
......@@ -83,4 +83,4 @@ oclc_num = 035a, (pattern_map.oclc_num)
pattern_map.oclc_num.pattern_0 = \\(OCoLC\\)[^0-9]*[0]*([0-9]+)=>$1
pattern_map.oclc_num.pattern_1 = ocm[0]*([0-9]+)[ ]*[0-9]*=>$1
pattern_map.oclc_num.pattern_2 = ocn[0]*([0-9]+).*=>$1
pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1
\ No newline at end of file
pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1
......@@ -39,8 +39,9 @@
# text, HTML, etc.) The first parameter is a fieldspec showing which fields to use
# for URL retrieval. The second parameter is optional -- if included, only files
# matching the specified suffix will be indexed. Note that this functionality
# depends on Aperture being installed on your system. See the wiki for details:
# http://vufind.org/wiki/aperture
# depends on a full text tool being installed on your system. See the wiki for
# details:
# http://vufind.org/wiki/importing_records#indexing_full_text
#fulltext = custom, getFulltext(856u, pdf)
# Uncomment the following line if you want to index latitude/longitude data for
......@@ -62,4 +63,4 @@
#container_issue = 773l
#container_start_page = 773q
#container_reference = 773g
#container_title = 773s
\ No newline at end of file
#container_title = 773s
......@@ -25,12 +25,12 @@ ContentType.Diorama = Physical Object
ContentType.Filmstrip = Visual Materials
ContentType.FlashCard = Visual Materials
ContentType.Game = Physical Object
ContentType.GovernmentDocumentFederal = Government Document|Gov Doc Fed
ContentType.GovernmentDocumentState = Government Document|Gov Doc State
ContentType.GovernmentDocumentStateUniversity = Gov Doc Univ
ContentType.GovernmentDocumentLocal = Gov Doc Local
ContentType.GovernmentDocumentInternational = Government Document|Gov Doc Intl
ContentType.GovernmentDocumentOther = Gov Doc Other
ContentType.GovernmentDocumentFederal = Government Document
ContentType.GovernmentDocumentState = Government Document
ContentType.GovernmentDocumentStateUniversity = null
ContentType.GovernmentDocumentLocal = null
ContentType.GovernmentDocumentInternational = Government Document
ContentType.GovernmentDocumentOther = null
ContentType.Globe = Map|Globe|Physical Object
ContentType.Graphic = Visual Materials
ContentType.Image = Visual Materials
......@@ -51,9 +51,9 @@ ContentType.MotionPicture = Film
ContentType.MusicalScore = Musical Score
ContentType.MusicalScoreManuscript = Musical Score|Manuscript
ContentType.MusicRecording = Sound Recording
ContentType.Newspaper = Journal/Magazine
ContentType.Newspaper = Journal/Magazine|Newspaper
ContentType.Pamphlet = Visual Materials
ContentType.Periodical = Journal/Magazine
ContentType.Periodical = Journal/Magazine|Periodical
ContentType.PhysicalObject = Physical Object
ContentType.Picture = Visual Materials
ContentType.ProjectedMedium = Visual Materials
......@@ -76,6 +76,7 @@ ContentType.Website = Online|Computer Resource
MediaType.ActivityCard = Visual Materials
MediaType.Atlas = Map|Atlas
MediaType.Braille = Braille
MediaType.Broadside = Broadside
MediaType.Chart = Visual Materials
MediaType.Collage = Visual Materials
MediaType.ComputerCard = Computer Media
......@@ -154,6 +155,7 @@ MediaType.SoundDisc = null
MediaType.SoundDiscCD = CD
MediaType.SoundDiscLP = LP
MediaType.SoundRecordingOther = Other Media
MediaType.SoundRecordingOnline = Streaming Audio
MediaType.SoundRoll = Roll
MediaType.SoundTapeReel = Tape Reel
MediaType.SoundTrackFilm = Sound Track Film
......@@ -204,8 +206,8 @@ FormOfItem.Microopaque = Microform
FormOfItem.PrintLarge = Large Print
FormOfItem.Braille = Braille
FormOfItem.Online = Online
FormOfItem.ElectronicDirect = Computer Resource
FormOfItem.Electronic = Computer Resource
FormOfItem.ElectronicDirect = null
FormOfItem.Electronic = null
FormOfItem.Print = Print
#CombinedType
......
......@@ -76,6 +76,7 @@ ContentType.Website = (Leader[67]=as OR 006[0]=s) AND (008[21]=w OR 006[4]=w)
MediaType.ActivityCard = 007[01]=ka
MediaType.Atlas = 007[01]=ad
MediaType.Braille = 007[01]=fb OR 007[01]=tc
MediaType.Broadside = Heuristic (300 field)
MediaType.Chart = 007[01]=kn
MediaType.Collage = 007[01]=kc
MediaType.ComputerCard = 007[01]=ck
......@@ -156,6 +157,7 @@ MediaType.SoundDisc = 007[01]=sd
MediaType.SoundDiscCD = 007[01]=sd AND 007[3]=f
MediaType.SoundDiscLP = 007[01]=sd AND 007[3]=abde
MediaType.SoundRecordingOther = 007[0]=s 007[1]!={cdefgirqstw}
MediaType.SoundRecordingOnline = 007[0]=s 007[1]=z AND Online
MediaType.SoundRoll = 007[01]=sq
MediaType.SoundTapeReel = 007[01]=st
MediaType.SoundTrackFilm = 007[01]=si
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment