基于Snoopy的PHP近似完美获取网站编码的代码 |
本文标签:Snoopy,网站编码 先要到网上下载Snoopy.class.php 调用方法: 复制代码 代码如下:
<?php
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php'; // contains the WebCrawl class listed below

$go = new WebCrawl('http://www.baidu.com');
echo $go->getCharset();
?>
复制代码 代码如下:
<?php
/**
 * Fetches a web page through Snoopy and reports its character encoding,
 * plus basic page information (title, keywords, description, IP address).
 *
 * The page is fetched at most once per instance. After a successful fetch
 * the body is lower-cased and, when the detected charset allows it,
 * converted to UTF-8. Because of the lower-casing, every value extracted
 * from the body (title, keywords, description) is returned in lower case.
 */
class WebCrawl
{
    /** @var string URL passed to the constructor. */
    private $url;

    /** @var Snoopy|null HTTP client holding the response; null until the first fetch. */
    private $request;

    /**
     * @var string|false|null Charset detected on the fetched body before any
     *      conversion; false when detection failed, null before the first fetch.
     */
    private $charset;

    /** @var string[] Charset names accepted from a <meta ... charset=...> declaration. */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936', 'ibm037', 'ibm437',
        'ibm500', 'asmo-708', 'dos-720', 'ibm737', 'ibm775', 'ibm850', 'ibm852',
        'ibm855', 'ibm857', 'ibm00858', 'ibm861', 'ibm860', 'dos-862', 'ibm863',
        'ibm864', 'ibm865', 'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875',
        'shift_jis', 'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044', 'ibm01045',
        'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049', 'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
        'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257',
        'windows-1258', 'johab', 'macintosh', 'x-mac-japanese',
        'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic', 'x-mac-hebrew',
        'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp', 'x-mac-romanian',
        'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce', 'x-mac-icelandic',
        'x-mac-turkish', 'x-mac-croatian', 'x-chinese-cns', 'x-cp20001',
        'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005', 'x-ia5',
        'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian', 'us-ascii',
        'x-cp20261', 'x-cp20269', 'ibm273', 'ibm277', 'ibm278', 'ibm280',
        'ibm284', 'ibm285', 'ibm290', 'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r', 'ibm871', 'ibm880',
        'ibm905', 'ibm00924', 'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-13',
        'iso-8859-15', 'x-europa', 'iso-8859-8-i', 'iso-2022-jp', 'csiso2022jp',
        'iso-2022-jp', 'iso-2022-kr', 'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr',
        'hz-gb-2312', 'gb18030', 'x-iscii-de', 'x-iscii-be', 'x-iscii-ta',
        'x-iscii-te', 'x-iscii-as', 'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma',
        'x-iscii-gu', 'x-iscii-pa', 'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url Address of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the page on first use and prepare its body for parsing.
     *
     * Later calls perform no request and only report whether the stored
     * response has status 200.
     *
     * @param string $url Address to fetch.
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request === null) {
            $this->request = new Snoopy();
            $this->request->fetch($url);

            if ($this->request->status == 200) {
                // The parsing code compares tag and attribute text against
                // lower-case literals, so the whole body is lower-cased once.
                $this->request->results = strtolower($this->request->results);

                // Detect on the untouched bytes and remember the answer:
                // running the detection again after the conversion below
                // could report a different charset.
                $this->charset = $this->detectCharset();
                $this->convertResultsToUtf8($this->charset);
            }
        }

        return $this->request->status == 200;
    }

    /**
     * Convert the fetched body to UTF-8 in place when the charset permits it.
     *
     * The body is left untouched when the charset is unknown, already UTF-8,
     * or not supported by mbstring.
     *
     * @param string|false $charset Lower-case charset name, or false when unknown.
     * @return void
     */
    private function convertResultsToUtf8($charset)
    {
        if (!is_string($charset) || $charset === '' || $charset === 'utf-8') {
            return;
        }

        if ($charset === 'windows-1252') {
            $this->request->results = $this->uni_decode($this->request->results);
            return;
        }

        try {
            $converted = mb_convert_encoding($this->request->results, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            // PHP 8 throws for encoding names mbstring does not know.
            $converted = false;
        }

        // Older PHP versions return false instead of throwing; keep the
        // original body rather than overwriting it with false.
        if (is_string($converted)) {
            $this->request->results = $converted;
        }
    }

    /**
     * Collect the page title, meta keywords, meta description and host IP.
     *
     * @return array{title:string,keywords:string,desc:string,ip:string}
     *         Every entry is an empty string when it could not be determined
     *         or the page could not be fetched.
     */
    public function getWebinfo()
    {
        $info = array(
            'title'    => '',
            'keywords' => '',
            'desc'     => '',
            'ip'       => '',
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        if (preg_match('/<title>([^>]*)<\/title>/si', $this->request->results, $titlematch)) {
            $info['title'] = strip_tags($titlematch[1]);
        }

        // Meta tags are written either name-first or content-first. Try the
        // name-first form, and fall back to content-first only when it found
        // neither of the two tags of interest.
        $metaTags = $this->matchMetaTags(
            '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            1,
            2
        );
        if (!isset($metaTags['keywords']) && !isset($metaTags['description'])) {
            $metaTags = $this->matchMetaTags(
                '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
                2,
                1
            );
        }

        if (isset($metaTags['keywords'])) {
            $info['keywords'] = $metaTags['keywords'];
        }
        if (isset($metaTags['description'])) {
            $info['desc'] = $metaTags['description'];
        }

        // Resolve the host part only, so https URLs and URLs with a path work.
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // URL without a scheme: treat it as a bare host name.
            $host = preg_replace('/http\:\/\//si', '', $this->url);
        }
        if (is_string($host) && $host !== '') {
            // gethostbyname() returns its argument unchanged on failure, so
            // the result must be validated as an address before it is used.
            $ip = gethostbyname($host);
            if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
                $info['ip'] = $ip;
            }
        }

        return $info;
    }

    /**
     * Run a meta-tag pattern over the fetched body and map names to contents.
     *
     * @param string $pattern    Pattern with one capture group for the name
     *                           attribute and one for the content attribute.
     * @param int    $nameGroup  Index of the group capturing the name.
     * @param int    $valueGroup Index of the group capturing the content.
     * @return array<string,string> Content by meta name; for a repeated name
     *                              the last occurrence wins.
     */
    private function matchMetaTags($pattern, $nameGroup, $valueGroup)
    {
        $tags = array();

        if (preg_match_all($pattern, $this->request->results, $match)) {
            foreach ($match[$nameGroup] as $index => $name) {
                $tags[$name] = $match[$valueGroup][$index];
            }
        }

        return $tags;
    }

    /**
     * Guess from raw byte patterns whether a string is UTF-8 or GB2312.
     *
     * Scans for the first non-ASCII byte sequence that looks like either
     * a three-byte sequence (lead byte with its top three bits set, followed
     * by two bytes with the high bit set) or a two-byte sequence (lead byte
     * with its top two bits set, followed by one byte with the high bit set).
     *
     * @param string $string Bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string "utf-8", "gb2312", or "" when no such sequence is found.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if ($this->byteAt($string, $i) < 128) {
                continue;
            }

            if (($this->byteAt($string, $i) & 224) == 224) {
                // Lead byte accepted; the next two bytes need the high bit set.
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    if (($this->byteAt($string, ++$i) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            // $i may have advanced above; the two-byte test deliberately
            // starts from wherever the three-byte test stopped.
            if (($this->byteAt($string, $i) & 192) == 192) {
                if (($this->byteAt($string, ++$i) & 128) == 128) {
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Byte value at an offset, or 0 when the offset is past the end.
     *
     * @param string $string
     * @param int    $index
     * @return int
     */
    private function byteAt($string, $index)
    {
        return $index < strlen($string) ? ord($string[$index]) : 0;
    }

    /**
     * Replace five-digit decimal character references (e.g. "&#20013;") with
     * the characters they denote.
     *
     * NOTE(review): the published listing shows the pattern as /(\d{5});/;
     * the "&#" prefix appears to have been lost when the article was rendered
     * and is restored here - confirm against the original source if available.
     *
     * @param string $str  Text containing character references.
     * @param string $code Target encoding; the result is converted from UTF-8
     *                     with iconv() when this is not "utf-8".
     * @return string|false Decoded text; false when iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        // Decode each reference on its own. Decoding the whole text as a JSON
        // string literal fails as soon as it contains a quote or a control
        // character.
        $str = preg_replace_callback(
            '/&#(\d{5});/',
            function ($dec) {
                return html_entity_decode('&#' . $dec[1] . ';', ENT_QUOTES, 'UTF-8');
            },
            $str
        );

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Report the character encoding of the page.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no charset could be determined.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Determine the charset of the fetched body.
     *
     * Sources are tried in order: the <meta ... charset=...> declaration
     * (accepted only when listed in $charset_arr, and for "gb2312" only when
     * the byte heuristic in t() agrees), then the response headers, then
     * mb_detect_encoding().
     *
     * @return string|false Lower-case charset name, or false when unknown.
     */
    private function detectCharset()
    {
        $html = $this->request->results;

        // 1. Charset declared inside the document.
        if (preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i", $html, $temp)) {
            $declared = strtolower($temp[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // Pages declaring gb2312 are often really UTF-8, so that one
                // declaration is cross-checked against the bytes.
                if ($declared !== 'gb2312' || $this->t($html, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($this->request->headers)) {
            $hstr = strtolower(implode("|||", $this->request->headers));
            if (preg_match("/charset=[^\w]?([-\w]+)/is", $hstr, $lang)) {
                return $lang[1];
            }
        }

        // 3. Statistical detection as the last resort.
        $encode_arr = array(
            "UTF-8", "GB2312", "GBK", "BIG5", "ASCII", "EUC-JP", "Shift_JIS",
            "CP936", "ISO-8859-1", "JIS", "eucjp-win", "sjis-win"
        );
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}
?> |