Skip to content
Snippets Groups Projects
Commit 285f5c02 authored by Demian Katz
Browse files

Improved image extraction for some non-English languages.

parent e16ea898
No related merge requests found
...@@ -110,7 +110,7 @@ class Wikipedia implements TranslatorAwareInterface ...@@ -110,7 +110,7 @@ class Wikipedia implements TranslatorAwareInterface
*/ */
public function setLanguage($lang) public function setLanguage($lang)
{ {
$this->lang = $lang; $this->lang = substr($lang, 0, 2); // strip off regional suffixes
} }
/** /**
...@@ -172,7 +172,8 @@ class Wikipedia implements TranslatorAwareInterface ...@@ -172,7 +172,8 @@ class Wikipedia implements TranslatorAwareInterface
$imageName = $imageCaption = null; $imageName = $imageCaption = null;
// Get rid of the last pair of braces and split // Get rid of the last pair of braces and split
$infobox = explode("\n|", substr($infoboxStr, 2, -2)); $infobox = explode("\n|", preg_replace('/^\s+|/m', '', substr($infoboxStr, 2, -2)));
// Look through every row of the infobox // Look through every row of the infobox
foreach ($infobox as $row) { foreach ($infobox as $row) {
$data = explode("=", $row); $data = explode("=", $row);
...@@ -185,11 +186,16 @@ class Wikipedia implements TranslatorAwareInterface ...@@ -185,11 +186,16 @@ class Wikipedia implements TranslatorAwareInterface
case "image": case "image":
case "image:": case "image:":
case "image_name": case "image_name":
case "imagem":
case 'imagen':
case 'immagine':
$imageName = str_replace(' ', '_', $value); $imageName = str_replace(' ', '_', $value);
break; break;
case "caption": case "caption":
case "img_capt": case "img_capt":
case "image_caption": case "image_caption":
case "legenda":
case 'textoimagen':
$imageCaption = $value; $imageCaption = $value;
break; break;
default: default:
...@@ -213,11 +219,17 @@ class Wikipedia implements TranslatorAwareInterface ...@@ -213,11 +219,17 @@ class Wikipedia implements TranslatorAwareInterface
// We are looking for the infobox inside "{{...}}" // We are looking for the infobox inside "{{...}}"
// It may contain nested blocks too, thus the recursion // It may contain nested blocks too, thus the recursion
preg_match_all('/\{([^{}]++|(?R))*\}/s', $body['*'], $matches); preg_match_all('/\{([^{}]++|(?R))*\}/s', $body['*'], $matches);
foreach ($matches[1] as $m) { foreach ($matches[1] as $m) {
// If this is the Infobox // Check if this is the Infobox; name may vary by language
if (substr($m, 0, 8) == "{Infobox") { $infoboxTags = array(
// Keep the string for later, we need the body block that follows it 'Bio', 'Ficha de escritor', 'Infobox', 'Info/Biografia'
return "{".$m."}"; );
foreach ($infoboxTags as $tag) {
if (substr($m, 0, strlen($tag) + 1) == '{' . $tag) {
// We found an infobox!!
return "{".$m."}";
}
} }
} }
...@@ -234,10 +246,16 @@ class Wikipedia implements TranslatorAwareInterface ...@@ -234,10 +246,16 @@ class Wikipedia implements TranslatorAwareInterface
protected function extractImageFromBody($body) protected function extractImageFromBody($body)
{ {
$imageName = $imageCaption = null; $imageName = $imageCaption = null;
$pattern = '/(\x5b\x5b)Image:([^\x5d]*)(\x5d\x5d)/U'; // The tag marking image files will vary depending on API language:
$tags = array(
'Archivo', 'Bestand', 'Datei', 'Ficheiro', 'Fichier', 'File', 'Image'
);
$pattern = '/(\x5b\x5b)('
. implode('|', $tags)
. '):([^\x5d]*\.jpg[^\x5d]*)(\x5d\x5d)/U';
preg_match_all($pattern, $body['*'], $matches); preg_match_all($pattern, $body['*'], $matches);
if (isset($matches[2][0])) { if (isset($matches[3][0])) {
$parts = explode('|', $matches[2][0]); $parts = explode('|', $matches[3][0]);
$imageName = str_replace(' ', '_', $parts[0]); $imageName = str_replace(' ', '_', $parts[0]);
if (count($parts) > 1) { if (count($parts) > 1) {
$imageCaption = strip_tags( $imageCaption = strip_tags(
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment