diff --git a/import/SolrMarc.jar b/import/SolrMarc.jar index 6e0dc8d1c7de1a80977ab92e1581abfd3783d864..9d3c027e238966c58e1c122af464239443e41bb0 100644 Binary files a/import/SolrMarc.jar and b/import/SolrMarc.jar differ diff --git a/import/VuFindIndexer.jar b/import/VuFindIndexer.jar index 31506f82194ba0e25982848df5904856249ff7fd..96092ab716c3de9efdf5773774de696c9f3e7123 100644 Binary files a/import/VuFindIndexer.jar and b/import/VuFindIndexer.jar differ diff --git a/import/index_scripts/getFulltext.bsh b/import/index_scripts/getFulltext.bsh index ab66f834a7caf8b434b74811c80f712f90b72765..c58f8e5ff309b863c8f726056415fe50708e7679 100644 --- a/import/index_scripts/getFulltext.bsh +++ b/import/index_scripts/getFulltext.bsh @@ -9,7 +9,6 @@ import org.marc4j.marc.Record; import org.marc4j.marc.DataField; import java.util.regex.Pattern; import java.io.*; -import org.ini4j.Ini; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.w3c.dom.Document; @@ -17,6 +16,53 @@ import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; +// define the base level indexer so that its methods can be called from the script. +// note that the SolrIndexer code will set this value before the script methods are called. +org.solrmarc.index.SolrIndexer indexer = null; + +/** + * Load configurations for the full text parser. Return an array containing the + * parser type in the first element and the parser configuration in the second + * element. + * + * @return String[] + */ +public String[] getFulltextParserSettings() +{ + String parserType = indexer.getConfigSetting( + "fulltext.ini", "General", "parser" + ); + if (null != parserType) { + parserType = parserType.toLowerCase(); + } + + // Is Aperture active? + String aperturePath = indexer.getConfigSetting( + "fulltext.ini", "Aperture", "webcrawler" + ); + if ((null == parserType && null != aperturePath) + || (null != parserType && parserType.equals("aperture")) + ) { + String[] array = { "aperture", aperturePath }; + return array; + } + + // Is Tika active? + String tikaPath = indexer.getConfigSetting( + "fulltext.ini", "Tika", "path" + ); + if ((null == parserType && null != tikaPath) + || (null != parserType && parserType.equals("tika")) + ) { + String[] array = { "tika", tikaPath }; + return array; + } + + // No recognized parser found: + String[] array = { "none", null }; + return array; +} + /** * Extract full-text from the documents referenced in the tags * @@ -28,9 +74,9 @@ import org.w3c.dom.NodeList; public String getFulltext(Record record, String fieldSpec, String extension) { String result = ""; - // Get the path to Aperture web crawler (and return no text if it is unavailable) - String aperturePath = getAperturePath(); - if (aperturePath == null) { + // Get the web crawler settings (and return no text if it is unavailable) + String[] parserSettings = getFulltextParserSettings(); + if (parserSettings[0].equals("none")) { return null; } @@ -43,8 +89,8 @@ public String getFulltext(Record record, String fieldSpec, String extension) { String current = fieldsIter.next(); // Filter by file extension if (extension == null || current.endsWith(extension)) { - // Load the aperture output for each tag into a string - result = result + harvestWithAperture(current, aperturePath); + // Load the parser output for each tag into a string + result = result + harvestWithParser(current, parserSettings); } } } @@ -74,57 +120,48 @@ public String getFulltext(Record record) { } /** - * Extract the Aperture path from fulltext.ini + * Clean up XML data generated by Aperture * - * @return String Path to Aperture executables + * @param File The file to clean + * @return File A fixed version of the file */ -public String getAperturePath() { - // Obtain path to Aperture from the fulltext.ini file: - Ini ini = new Ini(); - - // Find VuFind's home directory in the environment; if it's not available, - // try using a relative path on the assumption that we are currently in - // VuFind's root directory: - String vufindHome = System.getenv("VUFIND_HOME"); - if (vufindHome == null) { - vufindHome = ""; - } +public File sanitizeApertureOutput(File f) +{ + //clean up the aperture xml output + File tempFile = File.createTempFile("buffer", ".tmp"); + FileOutputStream fw = new FileOutputStream(tempFile); + Writer writer = new OutputStreamWriter(fw, "UTF8"); - // TODO: update this to work with 2.0 config paths - String fulltextIniFile = vufindHome + "/web/conf/fulltext.ini"; - File file = new File(fulltextIniFile); - try { - ini.load(new FileReader(fulltextIniFile)); - } catch (Throwable e) { - dieWithError("Unable to access " + fulltextIniFile); - } - String aperturePath = ini.get("Aperture", "webcrawler"); - if (aperturePath == null) { - return null; - } - - // Drop comments if necessary: - int pos = aperturePath.indexOf(';'); - if (pos >= 0) { - aperturePath = aperturePath.substring(0, pos).trim(); + //delete this control character from the File and save + Reader fr = new FileReader(f); + BufferedReader br = new BufferedReader(fr); + while (br.ready()) { + writer.write(br.readLine().replaceAll("\u000C","")); } + writer.close(); + br.close(); + fr.close(); - // Strip wrapping quotes if necessary (the ini reader won't do this for us): - if (aperturePath.startsWith("\"")) { - aperturePath = aperturePath.substring(1, aperturePath.length()); - } - if (aperturePath.endsWith("\"")) { - aperturePath = aperturePath.substring(0, aperturePath.length() - 1); - } + return tempFile; +} - return aperturePath; +/** + * Clean up bad characters in the full text. + * + * @param String Text to clean + * @return String Cleaned text + */ +public String sanitizeFullText(text) +{ + String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; + return text.replaceAll(badChars, " "); } /** * Harvest the contents of a document file (PDF, Word, etc.) using Aperture. * This method will only work if Aperture is properly configured in the - * web/conf/fulltext.ini file. Without proper configuration, this will - * simply return an empty string. + * fulltext.ini file. Without proper configuration, this will simply return an + * empty string. * * @param String The url extracted from the MARC tag. * @param String The path to Aperture @@ -142,30 +179,84 @@ public String harvestWithAperture(url, aperturePath) { String cmd = aperturePath + " -o " + f.getAbsolutePath().toString() + " -x " + url; // Call Aperture - System.out.println("Loading fulltext from " + url + ". Please wait ..."); + //System.out.println("Loading fulltext from " + url + ". Please wait ..."); Process p = Runtime.getRuntime().exec(cmd); BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream())); while ((s = stdInput.readLine()) != null) { - System.out.println(s); + //System.out.println(s); } // Wait for Aperture to finish p.waitFor(); // Parse Aperture XML output - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = dbf.newDocumentBuilder(); - Document xmlDoc = db.parse(f); - NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); - if(nl != null && nl.getLength() > 0) { - Node node = nl.item(0); - if (node.getNodeType() == Node.ELEMENT_NODE) { - plainText = plainText + node.getTextContent(); + try { + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = dbf.newDocumentBuilder(); + File tempFile = sanitizeApertureOutput(f); + Document xmlDoc = db.parse(tempFile); + NodeList nl = xmlDoc.getElementsByTagName("plainTextContent"); + if(nl != null && nl.getLength() > 0) { + Node node = nl.item(0); + if (node.getNodeType() == Node.ELEMENT_NODE) { + plainText = plainText + node.getTextContent(); + } } - } - String badChars = "[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\\u10000-\\u10FFFF]+"; - plainText = plainText.replaceAll(badChars, " "); + plainText = sanitizeFullText(plainText); + + // we'll hold onto the temp file if it failed to parse for debugging; + // only set it up to be deleted if we've made it this far successfully. + tempFile.deleteOnExit(); + } catch (Exception e) { + indexer.getLogger().error("Error encountered parsing XML Document: " + e); + } return plainText; } + +/** + * Harvest the contents of a document file (PDF, Word, etc.) using Tika. + * This method will only work if Tika is properly configured in the fulltext.ini + * file. Without proper configuration, this will simply return an empty string. + * + * @param String The url extracted from the MARC tag. + * @param String The path to Tika + * @return String The full-text + */ +public String harvestWithTika(url, scraperPath) { + String plainText = ""; + + // Construct the command + String cmd = "java -jar " + scraperPath + " -t -eUTF8 " + url; + + // Call our scraper + //System.out.println("Loading fulltext from " + url + ". Please wait ..."); + Process p = Runtime.getRuntime().exec(cmd); + BufferedReader stdInput = new BufferedReader(new + InputStreamReader(p.getInputStream(), "UTF8")); + + // We'll build the string from the command output + StringBuilder stringBuilder= new StringBuilder(); + while ((s = stdInput.readLine()) != null) { + stringBuilder.append(s); + } + + return sanitizeFullText(stringBuilder.toString()); +} + +/** + * Harvest the contents of a document file (PDF, Word, etc.) using the active parser. + * + * @param String The url extracted from the MARC tag. + * @param String[] Configuration settings from getFulltextParserSettings. + * @return String The full-text + */ +public String harvestWithParser(url, settings) { + if (settings[0].equals("aperture")) { + return harvestWithAperture(url, settings[1]); + } else if (settings[0].equals("tika")) { + return harvestWithTika(url, settings[1]); + } + return null; +} \ No newline at end of file diff --git a/import/log4j.properties b/import/log4j.properties index e0a8fbefc77614e987e1f1405fd6a356f6ab6096..2fd730aefb4db06c51f734864c85c57451644f5d 100644 --- a/import/log4j.properties +++ b/import/log4j.properties @@ -23,7 +23,7 @@ log4j.appender.stdout.target=System.out # with a max file size of 100KB # and keep 1 previous log file log4j.appender.file=org.apache.log4j.RollingFileAppender -log4j.appender.file.File=solrmarc.log +log4j.appender.file.File=${one-jar.home.dir}solrmarc.log log4j.appender.file.MaxFileSize=100KB log4j.appender.file.MaxBackupIndex=1 log4j.appender.file.layout=org.apache.log4j.PatternLayout diff --git a/import/marc.properties b/import/marc.properties index fd088c9b109c75e32dd4894640609d503a62527e..4d8227d7832df97afaede2752cda0d59e9de5371 100644 --- a/import/marc.properties +++ b/import/marc.properties @@ -83,4 +83,4 @@ oclc_num = 035a, (pattern_map.oclc_num) pattern_map.oclc_num.pattern_0 = \\(OCoLC\\)[^0-9]*[0]*([0-9]+)=>$1 pattern_map.oclc_num.pattern_1 = ocm[0]*([0-9]+)[ ]*[0-9]*=>$1 pattern_map.oclc_num.pattern_2 = ocn[0]*([0-9]+).*=>$1 -pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1 \ No newline at end of file +pattern_map.oclc_num.pattern_3 = on[0]*([0-9]+).*=>$1 diff --git a/import/marc_local.properties b/import/marc_local.properties index 5a581716289adc5980e73d1066d00956d0ca8073..7e064492b1ae4bd77ef173b10b973910627728f9 100644 --- a/import/marc_local.properties +++ b/import/marc_local.properties @@ -39,8 +39,9 @@ # text, HTML, etc.) The first parameter is a fieldspec showing which fields to use # for URL retrieval. The second parameter is optional -- if included, only files # matching the specified suffix will be indexed. Note that this functionality -# depends on Aperture being installed on your system. See the wiki for details: -# http://vufind.org/wiki/aperture +# depends on a full text tool being installed on your system. See the wiki for +# details: +# http://vufind.org/wiki/importing_records#indexing_full_text #fulltext = custom, getFulltext(856u, pdf) # Uncomment the following line if you want to index latitude/longitude data for @@ -62,4 +63,4 @@ #container_issue = 773l #container_start_page = 773q #container_reference = 773g -#container_title = 773s \ No newline at end of file +#container_title = 773s diff --git a/import/translation_maps/getformat_mixin_map.properties b/import/translation_maps/getformat_mixin_map.properties index ecbc9931f563c77d333bc43b817143a924056075..941ebcd2550ba9f2be96413f6693a9999822a4d1 100644 --- a/import/translation_maps/getformat_mixin_map.properties +++ b/import/translation_maps/getformat_mixin_map.properties @@ -25,12 +25,12 @@ ContentType.Diorama = Physical Object ContentType.Filmstrip = Visual Materials ContentType.FlashCard = Visual Materials ContentType.Game = Physical Object -ContentType.GovernmentDocumentFederal = Government Document|Gov Doc Fed -ContentType.GovernmentDocumentState = Government Document|Gov Doc State -ContentType.GovernmentDocumentStateUniversity = Gov Doc Univ -ContentType.GovernmentDocumentLocal = Gov Doc Local -ContentType.GovernmentDocumentInternational = Government Document|Gov Doc Intl -ContentType.GovernmentDocumentOther = Gov Doc Other +ContentType.GovernmentDocumentFederal = Government Document +ContentType.GovernmentDocumentState = Government Document +ContentType.GovernmentDocumentStateUniversity = null +ContentType.GovernmentDocumentLocal = null +ContentType.GovernmentDocumentInternational = Government Document +ContentType.GovernmentDocumentOther = null ContentType.Globe = Map|Globe|Physical Object ContentType.Graphic = Visual Materials ContentType.Image = Visual Materials @@ -51,9 +51,9 @@ ContentType.MotionPicture = Film ContentType.MusicalScore = Musical Score ContentType.MusicalScoreManuscript = Musical Score|Manuscript ContentType.MusicRecording = Sound Recording -ContentType.Newspaper = Journal/Magazine +ContentType.Newspaper = Journal/Magazine|Newspaper ContentType.Pamphlet = Visual Materials -ContentType.Periodical = Journal/Magazine +ContentType.Periodical = Journal/Magazine|Periodical ContentType.PhysicalObject = Physical Object ContentType.Picture = Visual Materials ContentType.ProjectedMedium = Visual Materials @@ -76,6 +76,7 @@ ContentType.Website = Online|Computer Resource MediaType.ActivityCard = Visual Materials MediaType.Atlas = Map|Atlas MediaType.Braille = Braille +MediaType.Broadside = Broadside MediaType.Chart = Visual Materials MediaType.Collage = Visual Materials MediaType.ComputerCard = Computer Media @@ -154,6 +155,7 @@ MediaType.SoundDisc = null MediaType.SoundDiscCD = CD MediaType.SoundDiscLP = LP MediaType.SoundRecordingOther = Other Media +MediaType.SoundRecordingOnline = Streaming Audio MediaType.SoundRoll = Roll MediaType.SoundTapeReel = Tape Reel MediaType.SoundTrackFilm = Sound Track Film @@ -204,8 +206,8 @@ FormOfItem.Microopaque = Microform FormOfItem.PrintLarge = Large Print FormOfItem.Braille = Braille FormOfItem.Online = Online -FormOfItem.ElectronicDirect = Computer Resource -FormOfItem.Electronic = Computer Resource +FormOfItem.ElectronicDirect = null +FormOfItem.Electronic = null FormOfItem.Print = Print #CombinedType diff --git a/import/translation_maps/getformat_mixin_unmap_map.properties b/import/translation_maps/getformat_mixin_unmap_map.properties index 6a967b53aafd9492a9ee4f903e83195f7de413fd..dad1b5dd2ab50a607ca58ec7569ce0fe136cd2b2 100644 --- a/import/translation_maps/getformat_mixin_unmap_map.properties +++ b/import/translation_maps/getformat_mixin_unmap_map.properties @@ -76,6 +76,7 @@ ContentType.Website = (Leader[67]=as OR 006[0]=s) AND (008[21]=w OR 006[4]=w) MediaType.ActivityCard = 007[01]=ka MediaType.Atlas = 007[01]=ad MediaType.Braille = 007[01]=fb OR 007[01]=tc +MediaType.Broadside = Heuristic (300 field) MediaType.Chart = 007[01]=kn MediaType.Collage = 007[01]=kc MediaType.ComputerCard = 007[01]=ck @@ -156,6 +157,7 @@ MediaType.SoundDisc = 007[01]=sd MediaType.SoundDiscCD = 007[01]=sd AND 007[3]=f MediaType.SoundDiscLP = 007[01]=sd AND 007[3]=abde MediaType.SoundRecordingOther = 007[0]=s 007[1]!={cdefgirqstw} +MediaType.SoundRecordingOnline = 007[0]=s 007[1]=z AND Online MediaType.SoundRoll = 007[01]=sq MediaType.SoundTapeReel = 007[01]=st MediaType.SoundTrackFilm = 007[01]=si