基于Snoopy的PHP近似完美获取网站编码的代码


  本文标签:Snoopy,网站编码

首先需要从网上下载 Snoopy.class.php。
调用方法如下:

<?php
// Usage example: print the detected character set of a page.
require 'lib/Snoopy.class.php';
require 'lib/WebCrawl.class.php'; // file containing the WebCrawl class
$go = new WebCrawl('http://www.baidu.com');
echo $go->getCharset();
?>

WebCrawl 类(保存为 lib/WebCrawl.class.php)的代码如下:

<?php
/**
 * Fetches a web page through Snoopy and extracts its character set and basic
 * meta information (title, keywords, description, IPv4 address).
 *
 * The page is downloaded at most once per instance. After a successful
 * download the character set is detected on the raw bytes, the body is
 * converted to UTF-8 where possible, and ASCII letters are lower-cased; the
 * values returned by getWebinfo() are therefore lower-case.
 *
 * Requires the Snoopy class to be loaded and the mbstring extension.
 */
class WebCrawl
{
    /** URL passed to the constructor. */
    private $url;

    /** Snoopy instance holding the response; null until the first fetch. */
    private $request;

    /** Charset detected on the raw response: lower-case name, or false when none was found. */
    private $charset = false;

    /** Lower-case charset names accepted from a <meta ... charset=...> declaration. */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720',
        'ibm737', 'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865', 'cp866',
        'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis', 'ks_c_5601-1987',
        'ibm1026', 'ibm01047', 'ibm01047',
        'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044',
        'ibm01045', 'ibm01046', 'ibm01047', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr', 'x-cp50227',
        'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be',
    );

    /**
     * @param string $url Address of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Download the page (first call only) and normalise its body.
     *
     * Later calls reuse the stored response instead of fetching again.
     *
     * @param string $url Address to fetch.
     * @return bool True when the response status equals 200.
     */
    private function open($url)
    {
        if ($this->request === null) {
            $this->request = new Snoopy();
            $this->request->fetch($url);
            // Loose comparison kept from the original code: the type of
            // Snoopy's status field is not visible here (may be a string).
            if ($this->request->status == 200) {
                // Detect on the untouched bytes, then convert; detecting after
                // conversion would report the converted encoding instead.
                $this->charset = $this->detectCharset();
                $this->request->results = $this->normalizeBody(
                    $this->request->results,
                    $this->charset
                );
            }
        }

        return $this->request->status == 200;
    }

    /**
     * Convert a raw body to UTF-8 where possible and lower-case ASCII letters.
     *
     * @param string       $body    Raw response body.
     * @param string|false $charset Detected charset, or false when unknown.
     * @return string Normalised body.
     */
    private function normalizeBody($body, $charset)
    {
        if ($charset === 'windows-1252') {
            // Same special case as the original code: only numeric entities
            // are expanded, the bytes themselves are not re-encoded.
            $body = $this->uni_decode($body);
        } elseif ($charset !== false && $charset !== 'utf-8') {
            try {
                $converted = mb_convert_encoding($body, 'UTF-8', $charset);
            } catch (\ValueError $e) {
                // PHP 8 throws for charset names mbstring does not know.
                $converted = false;
            }
            if (is_string($converted)) {
                $body = $converted;
            }
        }

        // Lower-case after conversion: doing it on GBK/Big5/Shift_JIS bytes
        // would rewrite trail bytes that fall in the A-Z range. strtr() is
        // used because it only ever touches the listed ASCII letters.
        return strtr($body, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz');
    }

    /**
     * Collect the page title, meta keywords, meta description and IPv4 address.
     *
     * @return array Keys 'title', 'keywords', 'desc' and 'ip'; each value is a
     *               string and stays '' when it could not be determined.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => '',
        );
        if (!$this->open($this->url)) {
            return $info;
        }
        $html = $this->request->results;

        if (preg_match('/<title>([^>]*)<\/title>/si', $html, $titlematch)) {
            $info['title'] = strip_tags($titlematch[1]);
        }

        // First try <meta name=... content=...>; when that yields neither
        // keywords nor description, try the reversed attribute order.
        $meta = $this->extractMeta(
            '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            $html,
            1,
            2
        );
        if (!isset($meta['keywords']) && !isset($meta['description'])) {
            $meta = $this->extractMeta(
                '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
                $html,
                2,
                1
            );
        }
        if (isset($meta['keywords'])) {
            $info['keywords'] = $meta['keywords'];
        }
        if (isset($meta['description'])) {
            $info['desc'] = $meta['description'];
        }

        $info['ip'] = $this->resolveIp();

        return $info;
    }

    /**
     * Run a meta-tag pattern and return a name => content map.
     *
     * @param string $pattern    Regex with two capture groups.
     * @param string $html       Markup to search.
     * @param int    $nameGroup  Index of the group capturing the meta name.
     * @param int    $valueGroup Index of the group capturing the meta content.
     * @return array Map of meta name to content; a later tag with the same
     *               name overwrites an earlier one.
     */
    private function extractMeta($pattern, $html, $nameGroup, $valueGroup)
    {
        $tags = array();
        if (preg_match_all($pattern, $html, $match)) {
            foreach ($match[$nameGroup] as $i => $name) {
                $tags[$name] = $match[$valueGroup][$i];
            }
        }

        return $tags;
    }

    /**
     * Resolve the host part of the URL to an IPv4 address.
     *
     * @return string Dotted-quad address, or '' when the host is missing or
     *                cannot be resolved.
     */
    private function resolveIp()
    {
        $url = (string) $this->url;
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // parse_url() finds no host in a scheme-less URL such as
            // "www.example.com/path"; retry with a scheme prepended.
            $host = parse_url('http://' . $url, PHP_URL_HOST);
        }
        if (!is_string($host) || $host === '') {
            return '';
        }

        // gethostbyname() hands back its argument unchanged on failure, so the
        // result has to be validated as an address.
        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }

    /**
     * Decide whether text that contains non-ASCII bytes is UTF-8 or GB2312.
     *
     * Used to double-check pages that declare gb2312 but may really be UTF-8.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept so existing callers keep working.
     * @return string 'utf-8' when the bytes form valid UTF-8, 'gb2312' when
     *                they do not, '' when there is no byte above 0x7F and
     *                therefore nothing to judge by.
     */
    public function t($string, $o)
    {
        $string = (string) $string;
        if (!preg_match('/[\x80-\xFF]/', $string)) {
            return '';
        }

        // Validating the whole string avoids misreading GB2312 byte runs such
        // as B0 EC B7 A8 as a UTF-8 three-byte sequence.
        return mb_check_encoding($string, 'UTF-8') ? 'utf-8' : 'gb2312';
    }

    /**
     * Replace five-digit decimal character references (&#NNNNN;) with the
     * characters they denote.
     *
     * @param string $str  Text containing the references.
     * @param string $code Target encoding; anything other than 'utf-8' is
     *                     produced by passing the UTF-8 result through iconv().
     * @return string|false Decoded text, or false when iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        // Decode each reference directly; wrapping the whole text in a JSON
        // string breaks as soon as it contains a quote, backslash or newline.
        $str = preg_replace_callback(
            '/&#(\d{5});/',
            function ($dec) {
                return html_entity_decode($dec[0], ENT_QUOTES, 'UTF-8');
            },
            (string) $str
        );
        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Return the character set of the page.
     *
     * The value is detected once, on the raw response, and cached, so repeated
     * calls agree with each other even though the stored body is converted.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no charset was found.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Detect the charset of the fetched response.
     *
     * Order of precedence: <meta> declaration in the HTML, charset in the
     * response headers, then mb_detect_encoding() on the body.
     *
     * @return string|false Lower-case charset name, or false when undetected.
     */
    private function detectCharset()
    {
        $body = $this->request->results;

        // 1. Charset declared in the HTML itself.
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $body, $temp)) {
            $declared = strtolower($temp[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only trusted when the bytes agree;
                // otherwise fall through to the next source.
                if ($declared !== 'gb2312' || $this->t($body, $declared) === $declared) {
                    return $declared;
                }
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($this->request->headers)) {
            $hstr = strtolower(implode('|||', $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $hstr, $lang)) {
                return $lang[1];
            }
        }

        // 3. Guess from the bytes.
        $encode_arr = array(
            'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ASCII', 'EUC-JP', 'Shift_JIS',
            'CP936', 'ISO-8859-1', 'JIS', 'eucjp-win', 'sjis-win',
        );
        $encoded = mb_detect_encoding($body, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}
?>