Resolving VUFIND-693 (Upgrade to SolrMarc 2.5).

58995256 · Demian Katz · 2fe79719 · 58995256 · 58995256 · 58995256
Commit 58995256 authored 12 years ago by Demian Katz
--- a/import/SolrMarc.jar
+++ b/import/SolrMarc.jar
--- a/import/VuFindIndexer.jar
+++ b/import/VuFindIndexer.jar
--- a/import/index_scripts/getFulltext.bsh
+++ b/import/index_scripts/getFulltext.bsh
@@ -9,7 +9,6 @@ import org.marc4j.marc.Record;
 import org.marc4j.marc.DataField;
 import java.util.regex.Pattern;
 import java.io.*;
-import org.ini4j.Ini;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import org.w3c.dom.Document;
@@ -17,6 +16,53 @@ import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;

+// Handle to the base-level SolrMarc indexer, declared here so that its methods
+// (e.g. getConfigSetting and getLogger) can be called from this script.
+// It starts out null; the SolrIndexer code will set this value before the
+// script methods are called.
+org.solrmarc.index.SolrIndexer indexer = null;
+
+/**
+ * Load configurations for the full text parser.  Return an array containing the
+ * parser type in the first element and the parser configuration in the second
+ * element.
+ *
+ * The parser is taken from the "parser" setting in the General section of
+ * fulltext.ini.  When that setting is absent, the first tool with a configured
+ * path is used (Aperture is checked before Tika).  When a parser is selected
+ * explicitly but its path is not configured, an error is logged and the
+ * "none" result is returned so that callers never build a command from null.
+ *
+ * @return String[] { "aperture" | "tika" | "none", path (null for "none") }
+ */
+public String[] getFulltextParserSettings()
+{
+    String parserType = indexer.getConfigSetting(
+        "fulltext.ini", "General", "parser"
+    );
+    if (null != parserType) {
+        // Fold case with a fixed locale: the default-locale toLowerCase() turns
+        // "I" into a dotless i under a Turkish locale, so "TIKA" or "Tika" would
+        // never match "tika" below.
+        parserType = parserType.toLowerCase(java.util.Locale.ENGLISH);
+    }
+
+    // Is Aperture active?
+    String aperturePath = indexer.getConfigSetting(
+        "fulltext.ini", "Aperture", "webcrawler"
+    );
+    if ((null == parserType && null != aperturePath)
+        || (null != parserType && parserType.equals("aperture"))
+    ) {
+        if (null == aperturePath) {
+            indexer.getLogger().error(
+                "fulltext.ini selects the aperture parser but [Aperture] webcrawler is not set"
+            );
+            String[] disabled = { "none", null };
+            return disabled;
+        }
+        String[] array = { "aperture", aperturePath };
+        return array;
+    }
+
+    // Is Tika active?
+    String tikaPath = indexer.getConfigSetting(
+        "fulltext.ini", "Tika", "path"
+    );
+    if ((null == parserType && null != tikaPath)
+        || (null != parserType && parserType.equals("tika"))
+    ) {
+        if (null == tikaPath) {
+            indexer.getLogger().error(
+                "fulltext.ini selects the tika parser but [Tika] path is not set"
+            );
+            String[] disabled = { "none", null };
+            return disabled;
+        }
+        String[] array = { "tika", tikaPath };
+        return array;
+    }
+
+    // No recognized parser found:
+    String[] array = { "none", null };
+    return array;
+}
+
 /**
 * Extract full-text from the documents referenced in the tags
 *
@@ -28,9 +74,9 @@ import org.w3c.dom.NodeList;
 public String getFulltext(Record record, String fieldSpec, String extension) {
    String result = "";

-    // Get the path to Aperture web crawler (and return no text if it is unavailable)
-    String aperturePath = getAperturePath();
-    if (aperturePath == null) {
+    // Get the web crawler settings (and return no text if it is unavailable)
+    String[] parserSettings = getFulltextParserSettings();
+    if (parserSettings[0].equals("none")) {
        return null;
    }

@@ -43,8 +89,8 @@ public String getFulltext(Record record, String fieldSpec, String extension) {
            String current = fieldsIter.next();
            // Filter by file extension
            if (extension == null || current.endsWith(extension)) {
-                // Load the aperture output for each tag into a string
-                result = result + harvestWithAperture(current, aperturePath);
+                // Load the parser output for each tag into a string
+                result = result + harvestWithParser(current, parserSettings);
            }
        }
    }
@@ -74,57 +120,48 @@ public String getFulltext(Record record) {
 }

 /**
- * Extract the Aperture path from fulltext.ini
+ * Clean up XML data generated by Aperture
 *
- * @return String          Path to Aperture executables
+ * @param File The file to clean
+ * @return File A fixed version of the file
 */
-public String getAperturePath() {
-    // Obtain path to Aperture from the fulltext.ini file:
-    Ini ini = new Ini();
-
-    // Find VuFind's home directory in the environment; if it's not available,
-    // try using a relative path on the assumption that we are currently in
-    // VuFind's root directory:
-    String vufindHome = System.getenv("VUFIND_HOME");
-    if (vufindHome == null) {
-        vufindHome = "";
-    }
+public File sanitizeApertureOutput(File f)
+{
+    //clean up the aperture xml output
+    File tempFile = File.createTempFile("buffer", ".tmp");
+    FileOutputStream fw = new FileOutputStream(tempFile);
+    Writer writer = new OutputStreamWriter(fw, "UTF8");

-    // TODO: update this to work with 2.0 config paths
-    String fulltextIniFile = vufindHome + "/web/conf/fulltext.ini";
-    File file = new File(fulltextIniFile);
-    try {
-        ini.load(new FileReader(fulltextIniFile));
-    } catch (Throwable e) {
-        dieWithError("Unable to access " + fulltextIniFile);
-    }
-    String aperturePath = ini.get("Aperture", "webcrawler");
-    if (aperturePath == null) {
-        return null;
-    }
-
-    // Drop comments if necessary:
-    int pos = aperturePath.indexOf(';');
-    if (pos >= 0) {
-        aperturePath = aperturePath.substring(0, pos).trim();
+    //delete this control character from the File and save
+    Reader fr = new FileReader(f);
+    BufferedReader br = new BufferedReader(fr);
+    while (br.ready()) {
+        writer.write(br.readLine().replaceAll("\u000C",""));
    }
+    writer.close();
+    br.close();
+    fr.close();

-    // Strip wrapping quotes if necessary (the ini reader won't do this for us):
-    if (aperturePath.startsWith("\"")) {
-        aperturePath = aperturePath.substring(1, aperturePath.length());
-    }
-    if (aperturePath.endsWith("\"")) {
-        aperturePath = aperturePath.substring(0, aperturePath.length() - 1);
-    }
+    return tempFile;
+}

-    return aperturePath;
+/**
+ * Clean up bad characters in the full text.
+ *
+ * Each run of characters outside the ranges permitted by the XML 1.0 "Char"
+ * production (tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF) is
+ * replaced by a single space.
+ *
+ * @param String Text to clean
+ * @return String Cleaned text
+ */
+public String sanitizeFullText(text)
+{
+    // Supplementary code points have to be written as \x{...}.  The regex
+    // engine reads "\u10000" as \u1000 followed by a literal "0", so the old
+    // pattern never whitelisted U+10000-U+10FFFF and blanked out every
+    // character above U+FFFF.
+    String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\x{10000}-\\x{10FFFF}]+";
+    return text.replaceAll(badChars, " ");
 }

 /**
 * Harvest the contents of a document file (PDF, Word, etc.) using Aperture.
 * This method will only work if Aperture is properly configured in the
- * web/conf/fulltext.ini file.  Without proper configuration, this will
- * simply return an empty string.
+ * fulltext.ini file.  Without proper configuration, this will simply return an
+ * empty string.
 *
 * @param String The url extracted from the MARC tag.
 * @param String The path to Aperture
@@ -142,30 +179,84 @@ public String harvestWithAperture(url, aperturePath) {
    String cmd = aperturePath + " -o " + f.getAbsolutePath().toString()  + " -x " + url;

    // Call Aperture
-    System.out.println("Loading fulltext from " + url + ". Please wait ...");
+    //System.out.println("Loading fulltext from " + url + ". Please wait ...");
    Process p = Runtime.getRuntime().exec(cmd);
    BufferedReader stdInput = new BufferedReader(new
        InputStreamReader(p.getInputStream()));
    while ((s = stdInput.readLine()) != null) {
-        System.out.println(s);
+        //System.out.println(s);
    }
    // Wait for Aperture to finish
    p.waitFor();

    // Parse Aperture XML output
-    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
-    DocumentBuilder db = dbf.newDocumentBuilder();
-    Document xmlDoc = db.parse(f);
-    NodeList nl = xmlDoc.getElementsByTagName("plainTextContent");
-    if(nl != null && nl.getLength() > 0) {
-        Node node = nl.item(0);
-        if (node.getNodeType() == Node.ELEMENT_NODE) {
-            plainText = plainText + node.getTextContent();
+    try {
+        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+        DocumentBuilder db = dbf.newDocumentBuilder();
+        File tempFile = sanitizeApertureOutput(f);
+        Document xmlDoc = db.parse(tempFile);
+        NodeList nl = xmlDoc.getElementsByTagName("plainTextContent");
+        if(nl != null && nl.getLength() > 0) {
+            Node node = nl.item(0);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                plainText = plainText + node.getTextContent();
+            }
        }
-    }

-    String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+";
-    plainText = plainText.replaceAll(badChars, " ");
+        plainText = sanitizeFullText(plainText);
+
+        // we'll hold onto the temp file if it failed to parse for debugging;
+        // only set it up to be deleted if we've made it this far successfully.
+        tempFile.deleteOnExit();
+    } catch (Exception e) {
+        indexer.getLogger().error("Error encountered parsing XML Document: " + e);
+    }

    return plainText;
 }
+
+/**
+ * Harvest the contents of a document file (PDF, Word, etc.) using Tika.
+ * This method will only work if Tika is properly configured in the fulltext.ini
+ * file.  Without proper configuration, this will simply return an empty string.
+ *
+ * Tika is run as "java -jar <path> -t -eUTF8 <url>" and its standard output is
+ * collected line by line (line breaks preserved) and passed through
+ * sanitizeFullText.
+ *
+ * @param String The url extracted from the MARC tag.
+ * @param String The path to Tika
+ * @return String The full-text
+ */
+public String harvestWithTika(url, scraperPath) {
+    // Nothing to run without both a scraper and a target.
+    if (scraperPath == null || url == null) {
+        return "";
+    }
+
+    // Construct the command as an argument array rather than one string:
+    // Runtime.exec(String) splits on whitespace, so a URL or jar path that
+    // contains a space would be broken into extra arguments.
+    String[] cmd = { "java", "-jar", scraperPath, "-t", "-eUTF8", url };
+
+    // Call our scraper
+    Process p = Runtime.getRuntime().exec(cmd);
+    BufferedReader stdInput = new BufferedReader(new
+        InputStreamReader(p.getInputStream(), "UTF8"));
+
+    // We'll build the string from the command output
+    StringBuilder stringBuilder = new StringBuilder();
+    try {
+        String line;
+        while ((line = stdInput.readLine()) != null) {
+            // readLine() strips the line terminator; put one back so the last
+            // word of a line is not fused with the first word of the next.
+            stringBuilder.append(line).append('\n');
+        }
+    } finally {
+        stdInput.close();
+    }
+
+    // Wait for Tika to finish so the child process is reaped
+    p.waitFor();
+
+    return sanitizeFullText(stringBuilder.toString());
+}
+
+/**
+ * Dispatch a full-text harvest to whichever parser is active.
+ *
+ * @param String The url extracted from the MARC tag.
+ * @param String[] Settings array as produced by getFulltextParserSettings
+ *                 (parser type at index 0, parser path at index 1).
+ * @return String The full-text, or null when the parser type is not recognized
+ */
+public String harvestWithParser(url, settings) {
+    String parserType = settings[0];
+
+    if (parserType.equals("aperture")) {
+        return harvestWithAperture(url, settings[1]);
+    }
+    if (parserType.equals("tika")) {
+        return harvestWithTika(url, settings[1]);
+    }
+
+    // Unrecognized parser type: there is nothing to harvest with.
+    return null;
+}
\ No newline at end of file
--- a/import/log4j.properties
+++ b/import/log4j.properties
@@ -23,7 +23,7 @@ log4j.appender.stdout.target=System.out
 # with a max file size of 100KB
 # and keep 1 previous log file
 log4j.appender.file=org.apache.log4j.RollingFileAppender
-log4j.appender.file.File=solrmarc.log
+log4j.appender.file.File=${one-jar.home.dir}solrmarc.log
 log4j.appender.file.MaxFileSize=100KB
 log4j.appender.file.MaxBackupIndex=1
 log4j.appender.file.layout=org.apache.log4j.PatternLayout

--- a/import/marc.properties
+++ b/import/marc.properties
@@ -83,4 +83,4 @@ oclc_num = 035a, (pattern_map.oclc_num)
 pattern_map.oclc_num.pattern_0 = \\(OCoLC\\)[^0-9]*[0]*([0-9]+)=>$1
 pattern_map.oclc_num.pattern_1 = ocm[0]*([0-9]+)[ ]*[0-9]*=>$1
 pattern_map.oclc_num.pattern_2 = ocn[0]*([0-9]+).*=>$1
-pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1
\ No newline at end of file
+pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1
--- a/import/marc_local.properties
+++ b/import/marc_local.properties
@@ -39,8 +39,9 @@
 # text, HTML, etc.) The first parameter is a fieldspec showing which fields to use
 # for URL retrieval.  The second parameter is optional -- if included, only files
 # matching the specified suffix will be indexed.  Note that this functionality
-# depends on Aperture being installed on your system.  See the wiki for details:
-#       http://vufind.org/wiki/aperture
+# depends on a full text tool being installed on your system.  See the wiki for
+# details:
+#       http://vufind.org/wiki/importing_records#indexing_full_text
 #fulltext = custom, getFulltext(856u, pdf)

 # Uncomment the following line if you want to index latitude/longitude data for
@@ -62,4 +63,4 @@
 #container_issue = 773l
 #container_start_page = 773q
 #container_reference = 773g
-#container_title = 773s
\ No newline at end of file
+#container_title = 773s
--- a/import/translation_maps/getformat_mixin_map.properties
+++ b/import/translation_maps/getformat_mixin_map.properties
@@ -25,12 +25,12 @@ ContentType.Diorama = Physical Object
 ContentType.Filmstrip = Visual Materials
 ContentType.FlashCard = Visual Materials
 ContentType.Game = Physical Object
-ContentType.GovernmentDocumentFederal = Government Document|Gov Doc Fed
-ContentType.GovernmentDocumentState = Government Document|Gov Doc State
-ContentType.GovernmentDocumentStateUniversity = Gov Doc Univ
-ContentType.GovernmentDocumentLocal = Gov Doc Local
-ContentType.GovernmentDocumentInternational = Government Document|Gov Doc Intl
-ContentType.GovernmentDocumentOther = Gov Doc Other
+ContentType.GovernmentDocumentFederal = Government Document
+ContentType.GovernmentDocumentState = Government Document
+ContentType.GovernmentDocumentStateUniversity = null
+ContentType.GovernmentDocumentLocal = null
+ContentType.GovernmentDocumentInternational = Government Document
+ContentType.GovernmentDocumentOther = null
 ContentType.Globe = Map|Globe|Physical Object
 ContentType.Graphic = Visual Materials
 ContentType.Image = Visual Materials
@@ -51,9 +51,9 @@ ContentType.MotionPicture = Film
 ContentType.MusicalScore = Musical Score
 ContentType.MusicalScoreManuscript = Musical Score|Manuscript
 ContentType.MusicRecording = Sound Recording
-ContentType.Newspaper = Journal/Magazine
+ContentType.Newspaper = Journal/Magazine|Newspaper
 ContentType.Pamphlet = Visual Materials
-ContentType.Periodical = Journal/Magazine
+ContentType.Periodical = Journal/Magazine|Periodical
 ContentType.PhysicalObject = Physical Object
 ContentType.Picture = Visual Materials
 ContentType.ProjectedMedium = Visual Materials
@@ -76,6 +76,7 @@ ContentType.Website = Online|Computer Resource
 MediaType.ActivityCard = Visual Materials
 MediaType.Atlas = Map|Atlas
 MediaType.Braille = Braille
+MediaType.Broadside = Broadside
 MediaType.Chart = Visual Materials
 MediaType.Collage = Visual Materials
 MediaType.ComputerCard = Computer Media
@@ -154,6 +155,7 @@ MediaType.SoundDisc = null
 MediaType.SoundDiscCD = CD
 MediaType.SoundDiscLP = LP
 MediaType.SoundRecordingOther = Other Media
+MediaType.SoundRecordingOnline = Streaming Audio
 MediaType.SoundRoll = Roll
 MediaType.SoundTapeReel = Tape Reel
 MediaType.SoundTrackFilm = Sound Track Film
@@ -204,8 +206,8 @@ FormOfItem.Microopaque = Microform
 FormOfItem.PrintLarge = Large Print
 FormOfItem.Braille = Braille
 FormOfItem.Online = Online
-FormOfItem.ElectronicDirect = Computer Resource
-FormOfItem.Electronic = Computer Resource
+FormOfItem.ElectronicDirect = null
+FormOfItem.Electronic = null
 FormOfItem.Print = Print

 #CombinedType

--- a/import/translation_maps/getformat_mixin_unmap_map.properties
+++ b/import/translation_maps/getformat_mixin_unmap_map.properties
@@ -76,6 +76,7 @@ ContentType.Website = (Leader[67]=as OR 006[0]=s) AND (008[21]=w OR 006[4]=w)
 MediaType.ActivityCard = 007[01]=ka
 MediaType.Atlas = 007[01]=ad
 MediaType.Braille = 007[01]=fb OR 007[01]=tc
+MediaType.Broadside = Heuristic (300 field)
 MediaType.Chart = 007[01]=kn
 MediaType.Collage = 007[01]=kc
 MediaType.ComputerCard = 007[01]=ck
@@ -156,6 +157,7 @@ MediaType.SoundDisc = 007[01]=sd
 MediaType.SoundDiscCD = 007[01]=sd AND 007[3]=f
 MediaType.SoundDiscLP = 007[01]=sd AND 007[3]=abde
 MediaType.SoundRecordingOther = 007[0]=s 007[1]!={cdefgirqstw}
+MediaType.SoundRecordingOnline = 007[0]=s 007[1]=z AND Online
 MediaType.SoundRoll = 007[01]=sq
 MediaType.SoundTapeReel = 007[01]=st
 MediaType.SoundTrackFilm = 007[01]=si