diff --git a/link_preview/linkinfo.php b/link_preview/linkinfo.php index 1a4e0a7..09f914b 100644 --- a/link_preview/linkinfo.php +++ b/link_preview/linkinfo.php @@ -1,109 +1,111 @@ initialized - // 1 => fetched - // 2 => parsed - - private $ch; - private $doc; - - private $metaAttributes = array('author', 'description', 'keywords', 'date', 'generator'); - private $otherTagNames = array('h1', 'h2', 'h3'); - - private $ua = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'; - - function __construct($url) { + protected $info; + protected $state = -1; // 0 => initialized, 1 => fetched, 2 => parsed + + protected $ch; + protected $doc; + + protected $metaAttributes = array('author', 'description', 'keywords', 'date', 'generator'); + protected $otherTagNames = array('h1', 'h2', 'h3'); + + protected $ua = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'; + + public function __construct($url) { $this->info['originalUrl'] = $url; } - - function __get($member) { + + public function __get($member) { if ($this->state < 2) { $this->parse(); } - return $this->info[$member]; } - - function __isset($member) { + + public function __isset($member) { return isset($this->info[$member]); } - - function get() { + + public function get() { if ($this->state < 2) { $this->parse(); } - return $this->info; } - - private function init() { + + protected function init() { if (!empty($this->info['originalUrl'])) { // TODO: implement url regex checks $this->ch = curl_init(); curl_setopt($this->ch, CURLOPT_URL, $this->info['originalUrl']); curl_setopt($this->ch, CURLOPT_HEADER, false); + curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, 1); curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($this->ch, CURLINFO_HEADER_OUT, true); curl_setopt($this->ch, CURLOPT_USERAGENT, $this->ua); - + echo 'request: ' . $this->info['originalUrl'] . '
'; + $this->state = 0; } else { - die('cannot init linkinfo instance: invalid url'); + throw new Exception('LinkInfo: invalid url: ' . $this->info['originalUrl']); } } - function fetch() { + public function fetch() { if ($this->state < 0) { $this->init(); } - - $html = curl_exec($this->ch) or die(curl_error($this->ch)); + + $html = curl_exec($this->ch); + if (!$html) { + throw new Exception(curl_error($this->ch)); + } + $contentType = curl_getinfo($this->ch, CURLINFO_CONTENT_TYPE); - + preg_match( '=^([\w/+-]+)(;\s+charset\=(\S+))?=i', $contentType, $matches ); if ( isset( $matches[1] ) ) $this->info['mime'] = $matches[1]; if ( isset( $matches[3] ) ) $this->info['charset'] = strtoupper($matches[3]); - + $this->info['effectiveUrl'] = curl_getinfo($this->ch, CURLINFO_EFFECTIVE_URL); $this->info['effectiveUrlParsed'] = parse_url($this->info['effectiveUrl']); - + $this->doc = new DOMDocument(); @$this->doc->loadHTML($html); curl_close($this->ch); - + $this->state = 1; } - - - function parse() { + + public function parse() { if ($this->state < 1) { $this->fetch(); } - + $head = $this->doc->getElementsByTagName('head')->item(0); $body = $this->doc->getElementsByTagName('body')->item(0); - + // title & base $this->info['title'] = $this->escapeTagValue($head->getElementsByTagName('title')->item(0)->nodeValue); - + if ($head->getElementsByTagName('base')->length > 0) { $this->info['baseUrl'] = $head->getElementsByTagName('base')->item(0)->getAttribute('href'); $this->info['baseUrlParsed'] = parse_url($this->info['baseUrl']); } - + // other tags foreach ($this->otherTagNames as $tn) { $this->info[$tn] = array(); - - foreach ($body->getElementsByTagName($tn) as $t) - $this->info[$tn][] = $this->escapeTagValue($t->nodeValue); + + foreach ($body->getElementsByTagName($tn) as $t) { + $this->info[$tn][] = $this->escapeTagValue($t->nodeValue); + } } - + // meta tags $metaTags = $head->getElementsByTagName('meta'); foreach ($metaTags as $mt) { @@ -111,111 +113,115 @@ class LinkInfo { $this->info[$mt->getAttribute('name')] = $mt->getAttribute('content'); } } - + // keywords - $this->info['keywordsSeperated'] = array(); $this->info['keywordsSeperated'] = explode(',', $this->info['keywords']); $this->info['keywordsSeperated'] = array_map('trim', $this->info['keywordsSeperated']); $this->info['keywordsSeperated'] = array_filter($this->info['keywordsSeperated']); - - // favicon - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $this->info['effectiveUrlParsed']['scheme'] . '://' . $this->info['effectiveUrlParsed']['host'] . '/favicon.ico'); - curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'); - if (curl_exec($ch)) { - if (curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > 0 && preg_match('=^image/=', curl_getinfo($ch, CURLINFO_CONTENT_TYPE))) - $ico = '/favicon.ico'; - - curl_close($ch); - } - + // favicon foreach ($head->getElementsByTagName('link') as $link) { if (in_array($link->getAttribute('rel'), array('shortcut icon', 'icon'))) $ico = $link->getAttribute('href'); } - - if (isset($ico)) + + if (!isset($ico)) { + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $this->info['effectiveUrlParsed']['scheme'] . '://' . $this->info['effectiveUrlParsed']['host'] . '/favicon.ico'); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_USERAGENT, $this->ua); + curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 1); + echo 'request: ' . $this->info['effectiveUrlParsed']['scheme'] . '://' . $this->info['effectiveUrlParsed']['host'] . '/favicon.ico' . '
'; + + try { + if (curl_exec($ch)) { + if (curl_getinfo($ch, CURLINFO_SIZE_DOWNLOAD) > 0 && preg_match('=^image/=', curl_getinfo($ch, CURLINFO_CONTENT_TYPE))) { + $ico = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL); + } + curl_close($ch); + } + } catch (Exception $e) { }; + } + + if (isset($ico)) { $this->info['favicon'] = $this->toAbsoluteUrl(parse_url($ico), $this->info['effectiveUrlParsed'], $this->info['baseUrlParsed']); - + } + // images from html code $images = array(); foreach ($body->getElementsByTagName('img') as $img) { $images[] = $img->getAttribute('src'); } - + // site specific images if (preg_match('=youtube\.com/watch\?v\=([a-zA-Z0-9]+)=', $this->info['effectiveUrl'], $matches)) { $images[] = 'http://img.youtube.com/vi/' . $matches[1] . '/0.jpg'; } - + + array_unique($images); + $this->info['images'] = array(); foreach ($images as $n => $img) { $url = $this->toAbsoluteUrl(parse_url($img), $this->info['effectiveUrlParsed'], $this->info['baseUrlParsed']); + echo $url . '
'; $size = getimagesize($url); - + if ($size) $this->info['images'][] = array('url' => $url, 'size' => $size); } - - array_unique($this->info['images']); - + array_filter($this->info['images'], function($image) { return $image['size'][0] * $image['size'][1] > 3500 && in_array($image['size'][2], array(IMAGETYPE_GIF, IMAGETYPE_JPEG, IMAGETYPE_PNG, IMAGETYPE_BMP, IMAGETYPE_TIFF_II, IMAGETYPE_TIFF_MM)); }); - + usort($this->info['images'], function($a, $b) { $a = $a['size'][0] * $a['size'][1]; $b = $b['size'][0] * $b['size'][1]; - + return $b - $a; }); - + $this->state = 2; } - - private function escapeTagValue($value) { + + protected function escapeTagValue($value) { $value = mb_convert_encoding($value, 'UTF-8', array($this->info['charset'] , 'auto')); $value = strip_tags($value); $value = preg_replace('=(\s){2,}=', ' ', $value); $value = trim($value); $value = html_entity_decode($value, ENT_COMPAT); - + return $value; } - - private function toAbsoluteUrl($url, $effectiveUrl, $baseUrl = '') { - if (empty($url['scheme'])) { // missing scheme - if ($url['path'][0] == '/' && empty($url['scheme']) && empty($url['host'])) { // absolute + + protected function toAbsoluteUrl($url, $effectiveUrl, $baseUrl = NULL) { + if (empty($url['scheme'])) { // missing scheme + if ($url['path'][0] == '/' && empty($url['host'])) { // absolute $absUrl = $effectiveUrl['scheme'] . '://' . $effectiveUrl['host']; } - else { // relative - if (!empty($baseUrl)) { - if (!empty($baseUrl['scheme']) && !empty($baseUrl['host'])) - $u .= $baseUrl['scheme'] . '://' . $baseUrl['host']; - - $absUrl = $u . dirname($baseUrl['path']) . '/'; + else { // relative + if (isset($baseUrl)) { + $absUrl = $baseUrl['scheme'] . '://' . $baseUrl['host'] . $baseUrl['path']; + } + else { + $absUrl = $effectiveUrl['scheme'] . '://' . $effectiveUrl['host'] . $effectiveUrl['path'] . '/'; } - else - $absUrl = $effectiveUrl['scheme'] . '://' . $effectiveUrl['host'] . dirname($effectiveUrl['path']) . '/'; } } else { $absUrl = $url['scheme'] . '://' . $url['host']; } - - $absUrl .= pathinfo($url['path'], PATHINFO_DIRNAME) . '/'; - $absUrl .= rawurlencode(pathinfo($url['path'], PATHINFO_BASENAME)); - + + $absUrl .= $url['path']; + if (!empty($url['query'])) $absUrl .= '?' . $url['query']; - + if (!empty($url['fragment'])) $absUrl .= '#' . $url['fragment']; - + return $absUrl; } } diff --git a/link_preview/test.php b/link_preview/test.php index 002b515..a0a8300 100644 --- a/link_preview/test.php +++ b/link_preview/test.php @@ -1,4 +1,5 @@ @@ -8,39 +9,39 @@ header('Content-type: text/html; charset=UTF-8'); font-family: "lucida grande", tahoma, verdana, arial, sans-serif; font-size: 11px; } - + a { color: #3B5998; text-decoration: none; } - + a img { border: 0; } - + .link { padding: 5px; } - + .link-container { width: 450px; } - + .link-title { font-weight: bold; } - + .link + hr { border: 1px solid #606060; clear: both; } - + .link-icon { height: 16px; width: 16px; margin: 0 4px -3px 0; } - + .link > a > img { max-height: 250px; max-width: 120px; @@ -48,12 +49,12 @@ header('Content-type: text/html; charset=UTF-8'); margin-right: 10px; margin-bottom: 5px; } - + .link > div { line-height: 14px; color: #808080; } - + .link-keywords > span { color: black; } @@ -67,61 +68,61 @@ header('Content-type: text/html; charset=UTF-8'); function limit($text, $chars = 400, $whitespace = ' ') { $arr = explode($whitespace, $text); - + while (strlen($limited) < $chars) $limited .= array_shift($arr) . ' '; - + if (strlen($text) > strlen($limited)) $limited .= '...'; - + return $limited; } -error_reporting(E_ALL ^ E_NOTICE); - include 'linkinfo.php'; -$urls = array('http://google.de', - 'http://de.wikipedia.org/wiki/Favicon', - 'http://code.google.com/p/kaytwo-i18n/source/browse/trunk/de_DE/de_DE.po', - 'http://katalyse.de/dokuwiki/doku.php?id=edv:passwoerter', - 'http://katalyse.de/', - 'http://www.youtube.com/watch?v=Ioz4lq2lS0E', - 'http://de.selfhtml.org/css/formate/zentrale.htm', - 'http://www.reichelt.de', - 'http://de2.php.net/manual/en/function.str-split.php', - 'http://www.umweltlexikon-online.de/fp/archiv/RUBernaehrunglebensmittel/Brocaindex.php', - 'http://cfranke.com'); +$urls = array( + 'http://google.de', + 'http://de.wikipedia.org/wiki/Favicon', + 'http://code.google.com/p/kaytwo-i18n/source/browse/trunk/de_DE/de_DE.po', + 'http://katalyse.de/dokuwiki/doku.php?id=edv:passwoerter', + 'http://katalyse.de/', + 'http://www.youtube.com/watch?v=Ioz4lq2lS0E', + 'http://de.selfhtml.org/css/formate/zentrale.htm', + 'http://www.reichelt.de', + 'http://de2.php.net/manual/en/function.str-split.php', + 'http://www.umweltlexikon-online.de/fp/archiv/RUBernaehrunglebensmittel/Brocaindex.php', + 'http://cfranke.com' +); foreach ($urls as $url) { $li = new LinkInfo($url); - + echo '
';