From dbb4f92ba06537cdbaac6b3282a31527d02d40bd Mon Sep 17 00:00:00 2001
From: Demian Katz <demian.katz@villanova.edu>
Date: Fri, 8 Nov 2013 08:10:08 -0500
Subject: [PATCH] More refactoring/simplification.

---
 .../src/VuFind/Recommend/AuthorInfo.php       | 87 ++++++++++---------
 1 file changed, 45 insertions(+), 42 deletions(-)

diff --git a/module/VuFind/src/VuFind/Recommend/AuthorInfo.php b/module/VuFind/src/VuFind/Recommend/AuthorInfo.php
index d8f6a23243d..0bd6caecfda 100644
--- a/module/VuFind/src/VuFind/Recommend/AuthorInfo.php
+++ b/module/VuFind/src/VuFind/Recommend/AuthorInfo.php
@@ -341,6 +341,45 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface
         return array($imageName, $imageCaption);
     }
 
+    /**
+     * Support method for sanitizeWikipediaBody -- strip image/file links.
+     *
+     * @param string $body The Wikipedia response to sanitize
+     *
+     * @return string Body with [[File:...]] and [[Image:...]] links removed
+     */
+    protected function stripImageAndFileLinks($body)
+    {
+        // Remove unwanted image/file links. Links can nest, e.g.
+        //     [[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by
+        //     [[Elias Gottlob Haussmann|Haussmann]]]]
+        // so every bracketed span is matched recursively and its prefix checked
+        // afterwards: 'File'/'Image' can't be mandatory in the pattern (the
+        // recursion fails) or optional (normal links get hit).
+        //    ... unless there's a better pattern? TODO
+        $open    = "\\[";
+        $close   = "\\]";
+        $content = "(?>[^\\[\\]]+)";  // Atomic run of anything but [ or ]
+        // Match plain content or nested brackets (via (?R) recursion):
+        $recursive_match = "($content|(?R))*";
+        $body .= "[[file:bad]]"; // dummy link; stripped again by the loop below
+        preg_match_all("/".$open.$recursive_match.$close."/Us", $body, $new_matches);
+        // Walk full matches and capture groups alike; only the prefix matters
+        if (is_array($new_matches)) {
+            foreach ($new_matches as $nm) {
+                foreach ((array)$nm as $n) {
+                    // Drop file/image links (every occurrence in the body)
+                    if (strtolower(substr($n, 0, 7)) == "[[file:"
+                        || strtolower(substr($n, 0, 8)) == "[[image:"
+                    ) {
+                        $body = str_replace($n, "", $body);
+                    }
+                }
+            }
+        }
+        return $body;
+    }
+
     /**
      * Support method for parseWikipedia - fix up details in the body
      *
@@ -350,6 +389,12 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface
      */
     protected function sanitizeWikipediaBody($body)
     {
+        // Cull our content back to everything before the first heading
+        $body = trim(substr($body, 0, strpos($body, "==")));
+
+        // Strip out links
+        $body = $this->stripImageAndFileLinks($body);
+
         // Initialize arrays of processing instructions
         $pattern = array();
         $replacement = array();
@@ -472,48 +517,6 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface
             // No infobox -- use whole thing:
             $body = $body['*'];
         }
-        // Find the first heading
-        $end    = strpos($body, "==");
-        // Now cull our content back to everything before the first heading
-        $body   = trim(substr($body, 0, $end));
-
-        // Remove unwanted image/file links
-        // Nested brackets make this annoying: We can't add 'File' or 'Image' as
-        //    mandatory because the recursion fails, or as optional because then
-        //    normal links get hit.
-        //    ... unless there's a better pattern? TODO
-        // eg. [[File:Johann Sebastian Bach.jpg|thumb|Bach in a 1748 portrait by
-        //     [[Elias Gottlob Haussmann|Haussmann]]]]
-        $open    = "\\[";
-        $close   = "\\]";
-        $content = "(?>[^\\[\\]]+)";  // Anything but [ or ]
-        // We can either find content or recursive brackets:
-        $recursive_match = "($content|(?R))*";
-        preg_match_all("/".$open.$recursive_match.$close."/Us", $body, $new_matches);
-        // Loop through every match (link) we found
-        if (is_array($new_matches)) {
-            foreach ($new_matches as $nm) {
-                // Might be an array of arrays
-                if (is_array($nm)) {
-                    foreach ($nm as $n) {
-                        // If it's a file link get rid of it
-                        if (strtolower(substr($n, 0, 7)) == "[[file:"
-                            || strtolower(substr($n, 0, 8)) == "[[image:"
-                        ) {
-                            $body = str_replace($n, "", $body);
-                        }
-                    }
-                } else {
-                    // Or just a normal array...
-                    // If it's a file link get rid of it
-                    if (strtolower(substr($nm, 0, 7)) == "[[file:"
-                        || strtolower(substr($nm, 0, 8)) == "[[image:"
-                    ) {
-                        $body = str_replace($nm, "", $body);
-                    }
-                }
-            }
-        }
 
         if (isset($imageUrl) && $imageUrl != false) {
             $info['image'] = $imageUrl;
-- 
GitLab