From 2fe79719f8b83ebc4701a1c0fb4ca232de9b2aea Mon Sep 17 00:00:00 2001 From: Demian Katz <demian.katz@villanova.edu> Date: Wed, 23 Jan 2013 09:42:58 -0500 Subject: [PATCH] Resolving VUFIND-739 (Wikipedia circular redirect causes infinite loop). --- .../src/VuFind/Recommend/AuthorInfo.php | 62 +++++++++++++++---- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/module/VuFind/src/VuFind/Recommend/AuthorInfo.php b/module/VuFind/src/VuFind/Recommend/AuthorInfo.php index de0e938429c..e0ee8f14543 100644 --- a/module/VuFind/src/VuFind/Recommend/AuthorInfo.php +++ b/module/VuFind/src/VuFind/Recommend/AuthorInfo.php @@ -86,6 +86,13 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface */ protected $useViaf = false; + /** + * Log of Wikipedia pages already retrieved + * + * @var array + */ + protected $pagesRetrieved = array(); + /** * Constructor * @@ -179,6 +186,23 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface $this->searchObject = $results; } + /** + * Check if a page has already been retrieved; if it hasn't, flag it as + * retrieved for future reference. + * + * @param string $author Author being retrieved + * + * @return bool + */ + protected function alreadyRetrieved($author) + { + if (isset($this->pagesRetrieved[$author])) { + return true; + } + $this->pagesRetrieved[$author] = true; + return false; + } + /** * Returns info from Wikipedia to the view * @@ -218,6 +242,12 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface */ protected function getWikipedia($author) { + // Don't retrieve the same page multiple times; this indicates a loop + // that needs to be broken! + if ($this->alreadyRetrieved($author)) { + return array(); + } + // Get information from Wikipedia API $uri = 'http://' . $this->lang . '.wikipedia.org/w/api.php' . '?action=query&prop=revisions&rvprop=content&format=php' . @@ -248,18 +278,28 @@ class AuthorInfo implements RecommendInterface, TranslatorAwareInterface return null; } - // Get the default page - $body = array_shift($body['query']['pages']); - $info = array('name' => $body['title'], 'wiki_lang' => $this->lang); - - // Get the latest revision - $body = array_shift($body['revisions']); - // Check for redirection - $as_lines = explode("\n", $body['*']); - if (stristr($as_lines[0], '#REDIRECT')) { - preg_match('/\[\[(.*)\]\]/', $as_lines[0], $matches); - return $this->getWikipedia($matches[1]); + // Loop through the pages and find the first that isn't a redirect: + foreach ($body['query']['pages'] as $page) { + $info['name'] = $page['title']; + + // Get the latest revision + $page = array_shift($page['revisions']); + // Check for redirection + $as_lines = explode("\n", $page['*']); + if (stristr($as_lines[0], '#REDIRECT')) { + preg_match('/\[\[(.*)\]\]/', $as_lines[0], $matches); + $redirectTo = $matches[1]; + } else { + $redirectTo = false; + break; + } + } + + // Recurse if we only found redirects: + if ($redirectTo) { + return $this->getWikipedia($redirectTo); } + $body = $page; /* Infobox */ -- GitLab