/**
 * Support method for sanitizeWikipediaBody -- strip image/file links.
 *
 * Finds every bracket-balanced [...] span in the text and deletes those
 * that begin (case-insensitively) with "[[file:" or "[[image:". All other
 * links are left untouched. If the regular expression finds nothing, or
 * fails outright, the input is returned unchanged.
 *
 * @param string $body The Wikipedia response to sanitize
 *
 * @return string
 */
protected function stripImageAndFileLinks($body)
{
    // Nested brackets make this annoying: We can't add 'File' or 'Image' as
    // mandatory in the pattern because the recursion fails, or as optional
    // because then normal links get hit. So we match every balanced bracket
    // span and filter on the prefix afterwards.
    // eg. [[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by
    //     [[Elias Gottlob Haussmann|Haussmann]]]]
    $content = "(?>[^\\[\\]]+)"; // Anything but [ or ]
    // A bracket pair containing any mix of content and recursive brackets:
    $pattern = "/\\[($content|(?R))*\\]/Us";

    // preg_match_all returns false on error and 0 when nothing matched; in
    // both cases there is nothing to strip, so hand the text back as-is.
    $matchSets = array();
    if (!preg_match_all($pattern, $body, $matchSets)) {
        return $body;
    }

    // Collect file/image links from every match set (full matches first,
    // then the capture group), preserving the order in which they are found.
    $unwanted = array();
    foreach ($matchSets as $matchSet) {
        foreach ((array)$matchSet as $link) {
            if (stripos($link, "[[file:") === 0
                || stripos($link, "[[image:") === 0
            ) {
                $unwanted[] = $link;
            }
        }
    }

    // str_replace applies the search strings one after another, in order.
    return str_replace($unwanted, "", $body);
}
[[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by - // [[Elias Gottlob Haussmann|Haussmann]]]] - $open = "\\["; - $close = "\\]"; - $content = "(?>[^\\[\\]]+)"; // Anything but [ or ] - // We can either find content or recursive brackets: - $recursive_match = "($content|(?R))*"; - preg_match_all("/".$open.$recursive_match.$close."/Us", $body, $new_matches); - // Loop through every match (link) we found - if (is_array($new_matches)) { - foreach ($new_matches as $nm) { - // Might be an array of arrays - if (is_array($nm)) { - foreach ($nm as $n) { - // If it's a file link get rid of it - if (strtolower(substr($n, 0, 7)) == "[[file:" - || strtolower(substr($n, 0, 8)) == "[[image:" - ) { - $body = str_replace($n, "", $body); - } - } - } else { - // Or just a normal array... - // If it's a file link get rid of it - if (strtolower(substr($nm, 0, 7)) == "[[file:" - || strtolower(substr($nm, 0, 8)) == "[[image:" - ) { - $body = str_replace($nm, "", $body); - } - } - } - } if (isset($imageUrl) && $imageUrl != false) { $info['image'] = $imageUrl; -- GitLab