php知道与问问的采集插件代码 |
本文标签:知道,问问,采集插件 最近发现知道和问问小偷的版本越来越多了!! 看过一个百度小偷的网站也达到了pr6 。收录十万多!! 在经过 荐礼啦 四十天的实践之后 发现百度对这个确实挺友好的 。 从网站访问来看 很多也是从百度搜索来的! 所以用知道和问问来填充网站内容还是可行的 。 于是自己开发了一个知道 问问的采集插件 原则上适合 php+mysql 并且文章是在一个表的程序 知道采集代码 复制代码 代码如下:
<?php
/**
 * baidu.php - Baidu Zhidao collector.
 *
 * Each request handles exactly one search result: it fetches result page
 * $page for the search phrase, picks entry number $rs on that page, stores
 * the question/answers if they pass the filters, and then redirects to
 * itself (via JavaScript) with $rs advanced by one. The loop stops once
 * $page exceeds 76.
 *
 * Query parameters: searchStr (search phrase), page (result page, default 1),
 * rs (index within the page, 0-9), jl (number of records stored so far).
 *
 * NOTE(review): $table_prefix, $cj_word, $arr_order, $min_t1, $min_t2,
 * $same_title, $art_author, $cat_id, $t_table, $t_art_title, $t_art_content,
 * $t_art_time, $t_artx_author, $t_catx_id and $keyword are not defined here;
 * presumably they come from stole_config.php / keyword.php - confirm.
 * NOTE(review): the mysql_* API is kept because conn.php (not visible here)
 * presumably opens the connection with it; it was removed in PHP 7.
 */
session_start();
header("content-type:text/html;charset=gbk");
require("stole_config.php");
require("conn.php");
require("keyword.php");

// An absent parameter simply means "show the search form".
$searchStr = isset($_GET["searchStr"]) ? $_GET["searchStr"] : "";

// URL-encode every space-separated term and join the terms with "+".
$word = implode("+", array_map("urlencode", explode(" ", $searchStr)));

$jl   = isset($_GET["jl"]) ? intval($_GET["jl"]) : 0;
$page = isset($_GET["page"]) ? intval($_GET["page"]) : 1;
$rs   = isset($_GET["rs"]) ? intval($_GET["rs"]) : 0;

// Ten results are requested per page (rn=10), so roll over to the next page.
if ($rs >= 10) {
    $rs = 0;
    $page++;
}
if ($page > 76) {
    echo "采集完毕 {$jl}";
    exit();
}

if (!empty($searchStr)) {
    // Fetch the result list; a failed fetch is treated as an empty page.
    $content = (string) @file_get_contents("http://zhidao.baidu.com/q?ct=17&lm=0&tn=ikaslist&pn=" . (($page - 1) * 10) . "&rn=10&word=" . $word);

    // Extract the question ids from the result links.
    preg_match_all("/<a href=\"\/question\/(.*)\.html/iUs", $content, $uidList);
    $uid = isset($uidList[1][$rs]) ? $uidList[1][$rs] : "";

    // With no id at this position there is nothing to collect this round;
    // the script still advances to the next position below.
    if ($uid !== "") {
        // Skip questions that were already stored.
        $suid = "bd{$uid}";
        $res  = mysql_query("select count(*) from {$table_prefix}c_article where suid='" . mysql_real_escape_string($suid) . "'");
        $row  = $res ? mysql_fetch_array($res) : false;
        $sct  = $row ? (int) $row[0] : 0;

        if ($sct == 0) {
            $content = (string) @file_get_contents("http://zhidao.baidu.com/question/" . $uid . ".html");

            // The title is the text between <cq> and </cq>.
            $arr       = explode("<cq>", $content);
            $art_title = isset($arr[1]) ? $arr[1] : "";
            $arr       = explode("</cq>", $art_title);
            $art_title = $arr[0];

            // The title must contain at least one of the configured keywords.
            $word_allow = false;
            foreach (explode(",", $cj_word) as $kw) {
                if ($kw !== "" && strpos($art_title, $kw) !== false) {
                    $word_allow = true;
                    break;
                }
            }

            if ($word_allow) {
                echo "开始采集内容<br>";
                echo "$art_title<br>";

                // Answers are the <pre> bodies inside <ca>/<cn> elements.
                preg_match_all("/(<ca>|<cn>)<pre>(.*)<\/pre>(<\/ca>|<\/cn>)/iUs", $content, $answerArr);
                $answerArr = $answerArr[2];
                if ($arr_order == 1) { // random order
                    shuffle($answerArr);
                }
                if ($arr_order == 2) { // reverse order
                    // krsort() sorts in place and returns a bool, so its result
                    // must not be assigned back to the array.
                    $answerArr = array_reverse($answerArr);
                }

                $art_content = "";
                $wrappers    = array("<ca><pre>", "</pre></ca>", "<cn><pre>", "</pre></cn>");
                foreach ($answerArr as $t) {
                    $answerTemp = str_replace($wrappers, "", $t);
                    if (strlen($answerTemp) > $min_t1) {
                        $art_content .= $answerTemp . "<br>";
                    }
                }

                // Strip links but keep their anchor text.
                $s1          = "/(<a .*>)(.*)<\/a>/iUs";
                $art_content = preg_replace($s1, '${2}', trim($art_content));
                $art_content = str_replace("\r\n", "<br>", $art_content);

                if (strlen($art_content) > $min_t2) {
                    // A repeated title gets a suffix so stored titles stay distinct.
                    $res      = mysql_query("select count(*) from {$table_prefix}c_article where art_title='" . mysql_real_escape_string($art_title) . "'");
                    $row      = $res ? mysql_fetch_array($res) : false;
                    $title_ct = $row ? (int) $row[0] : 0;
                    if ($title_ct > 0) {
                        $art_title .= "{$same_title}{$title_ct}";
                    }

                    $art_time    = date("Y-m-d");
                    $art_content = strtr($art_content, $keyword);

                    // Scraped text is untrusted: quote and escape every value.
                    $v_title   = "'" . mysql_real_escape_string($art_title) . "'";
                    $v_content = "'" . mysql_real_escape_string($art_content) . "'";
                    $v_time    = "'" . mysql_real_escape_string($art_time) . "'";
                    $v_author  = "'" . mysql_real_escape_string($art_author) . "'";
                    $v_suid    = "'" . mysql_real_escape_string($suid) . "'";

                    // Record in the collector's own table.
                    $sql = "insert into {$table_prefix}c_article(art_title,art_content,art_time,art_author,suid) values({$v_title},{$v_content},{$v_time},{$v_author},{$v_suid})";
                    mysql_query($sql);

                    // Copy into the site's article table, with a category if configured.
                    if (empty($t_catx_id)) {
                        $sql2 = "insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author}) values({$v_title},{$v_content},{$v_time},{$v_author})";
                    } else {
                        $v_cat = "'" . mysql_real_escape_string($cat_id) . "'";
                        $sql2  = "insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author},{$t_catx_id}) values({$v_title},{$v_content},{$v_time},{$v_author},{$v_cat})";
                    }
                    mysql_query($sql2);
                    $jl++;
                } else {
                    echo "内容长度不够";
                }
            } else {
                echo "主题不符合要求";
            }
        } else {
            echo "已经存在";
        }
    }

    // Log the progress and move on to the next result via a client-side redirect.
    $rs++;
    file_put_contents("bd.txt", "采集{$searchStr}到第{$page}第{$rs}条");
    echo "<script>location.href='baidu.php?searchStr=" . urlencode($searchStr) . "&page=" . $page . "&rs=" . $rs . "&jl=" . $jl . "';</script>";
    exit();
}
?>
<link href="style.css" rel="stylesheet" type="text/css" />
<table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC">
<tr> <td height="50" align="center" bgcolor="#00CC00"><h1><a href="http://www.jianlila.com">荐礼啦</a>知道问问采集插件</h1></td> </tr>
</table>
<table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC" style="margin-top:6px; margin-bottom:6px;">
<tr> <td height="30" align="center" bgcolor="#FFFFFF"><a href="cj_config.php">采集设置</a> <a href="uninstall.php" onclick="return confirm('您确定要卸载采集插件吗');">卸载采集</a> <a href="cj_view.php">查看采集记录</a> <a href="cj_help.php">采集帮助</a> <a href="baidu.php" target="_blank">知道采集</a> <a href="wenwen.php" target="_blank">问问采集</a></td> </tr>
</table>
<table width="537" height="45" align="center" style="margin-top:30px;"><tr><td height="39">
<form id="form1" name="form1" method="get" action="baidu.php">
<div id="search">
<?php /* Byte-wise escaping is safe for this GBK page: GBK trail bytes start at 0x40, above every HTML special character. */ ?>
<input name="searchStr" type="text" id="searchStr" value="<?php echo htmlspecialchars($searchStr, ENT_QUOTES, 'ISO-8859-1'); ?>" size="60" />
<input type="submit" name="searchBtn" id="searchBtn" value="知道偷偷" style="height:25px; line-height:25px;" />
</div>
</form>
</td></tr></table>
问问采集代码: 复制代码 代码如下:
<?php
/**
 * wenwen.php - Soso Wenwen collector.
 *
 * Works like the Zhidao collector: one search result per request, then a
 * JavaScript redirect to itself with the position advanced. The search
 * phrase arrives either from the form (POST "ask") or, on follow-up
 * requests, already encoded in GET "sp". The loop stops once $pg exceeds 51.
 *
 * The fetched pages are handled as UTF-8; titles and content are converted
 * to GBK before being written to the database, and the GBK keyword list
 * ($cj_word) is converted to UTF-8 for matching.
 *
 * NOTE(review): the configuration variables used below are presumably
 * defined in stole_config.php / keyword.php - confirm.
 */
session_start();
header("content-type:text/html;charset=utf-8");
require("stole_config.php");
require("conn.php");
require("keyword.php");

if (!empty($_POST["ask"])) {
    // Question submitted through the form.
    $sp = "S" . urlencode(trim($_POST["ask"]));
} else {
    $sp = urlencode(isset($_GET["sp"]) ? $_GET["sp"] : "");
}

// The counter starts at 1 when absent; force an integer because it is
// echoed into the page and into the redirect script.
$jl = empty($_GET["jl"]) ? 1 : intval($_GET["jl"]);
$pg = isset($_GET["pg"]) ? intval($_GET["pg"]) : 0; // result page
$rs = isset($_GET["rs"]) ? intval($_GET["rs"]) : 0; // index within the page

if ($rs > 9) {
    $rs = 0;
    $pg++;
}
if ($pg > 51) {
    echo "采集完毕! 总共采集 " . htmlspecialchars(urldecode($sp), ENT_QUOTES, "UTF-8") . " " . $jl . "条记录";
    exit();
}

if ($sp) { // only run when a search phrase is set
    $str = (string) @file_get_contents("http://wenwen.soso.com/z/Search.e?sp={$sp}&pg={$pg}");

    // Isolate the result list, then extract every question link inside it.
    preg_match("/<ol class=\"result_list\">(.*)<\/ol>/iUs", $str, $asklist);
    $listHtml = isset($asklist[1]) ? $asklist[1] : "";
    $url      = "/<a target=\"_blank\" href=\"\/z\/(q.*\.htm)/iUs";
    preg_match_all($url, $listHtml, $urllist);
    $t = isset($urllist[1][$rs]) ? $urllist[1][$rs] : "";

    // With no link at this position there is nothing to collect this round.
    if ($t !== "") {
        // Skip questions that were already stored.
        $suid = "ww{$t}";
        $res  = mysql_query("select count(*) from {$table_prefix}c_article where suid='" . mysql_real_escape_string($suid) . "'");
        $row  = $res ? mysql_fetch_array($res) : false;
        $sct  = $row ? (int) $row[0] : 0;

        if ($sct == 0) {
            $html = (string) @file_get_contents("http://wenwen.soso.com/z/{$t}");
            $html = str_replace("<pre>", "", str_replace("</pre>", "", $html));
            $html = str_replace("<br/><br/><br/>", "<br/><br/>", $html);

            preg_match("/<div class=\"question_main\">.*<h3>(.*)<\/h3>/iUs", $html, $ask_title);
            $art_title = isset($ask_title[1]) ? $ask_title[1] : "";

            // preg_match() yields only the first answer block; its captured
            // groups are appended when they are long enough.
            preg_match("/<div class=\"answer_con\">(.*)<\/div>/iUs", $html, $answer);
            $art_content = "";
            for ($i = count($answer) - 1; $i >= 1; $i--) {
                if (strlen($answer[$i]) > $min_t1) {
                    $art_content .= $answer[$i];
                }
            }

            // Strip links but keep their anchor text.
            $s1          = "/(<a .*>)(.*)<\/a>/iUs";
            $art_content = preg_replace($s1, '${2}', trim($art_content));

            // The title must contain at least one of the configured keywords.
            $word_allow = false;
            foreach (explode(",", (string) iconv("gbk", "utf-8", $cj_word)) as $kw) {
                if ($kw !== "" && strpos($art_title, $kw) !== false) {
                    $word_allow = true;
                    break;
                }
            }

            if ($word_allow) {
                if (strlen($art_content) > $min_t2) {
                    echo "<font color=red>添加中............................</font><br>";
                    echo $art_title . "<br>";

                    $art_title = iconv("utf-8", "gbk", $art_title);

                    // A repeated title gets a suffix so stored titles stay distinct.
                    $res      = mysql_query("select count(*) from {$table_prefix}c_article where art_title='" . mysql_real_escape_string($art_title) . "'");
                    $row      = $res ? mysql_fetch_array($res) : false;
                    $title_ct = $row ? (int) $row[0] : 0;
                    if ($title_ct > 0) {
                        $art_title .= "{$same_title}{$title_ct}";
                    }

                    $art_content = iconv("utf-8", "gbk", str_replace("\r\n", "<br>", $art_content));
                    $art_content = strtr($art_content, $keyword);
                    $art_time    = date("Y-m-d");

                    // Scraped text is untrusted: quote and escape every value.
                    $v_title   = "'" . mysql_real_escape_string($art_title) . "'";
                    $v_content = "'" . mysql_real_escape_string($art_content) . "'";
                    $v_time    = "'" . mysql_real_escape_string($art_time) . "'";
                    $v_author  = "'" . mysql_real_escape_string($art_author) . "'";
                    $v_suid    = "'" . mysql_real_escape_string($suid) . "'";

                    // Record in the collector's own table.
                    $sql = "insert into {$table_prefix}c_article(art_title,art_content,art_time,art_author,suid) values({$v_title},{$v_content},{$v_time},{$v_author},{$v_suid})";
                    mysql_query($sql);

                    // Copy into the site's article table, with a category if configured.
                    if (empty($t_catx_id)) {
                        $sql2 = "insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author}) values({$v_title},{$v_content},{$v_time},{$v_author})";
                    } else {
                        $v_cat = "'" . mysql_real_escape_string($cat_id) . "'";
                        $sql2  = "insert into {$t_table}({$t_art_title},{$t_art_content},{$t_art_time},{$t_artx_author},{$t_catx_id}) values({$v_title},{$v_content},{$v_time},{$v_author},{$v_cat})";
                    }
                    mysql_query($sql2);
                    $jl++; // one more record stored
                } else {
                    echo "长度不够";
                }
            } else {
                echo "主题不符合要求";
            }
        } else {
            echo "已经存在";
        }
    }

    // Log the progress and move on to the next result via a client-side redirect.
    $rs++;
    $f_tt = urldecode($sp) . "--页数" . $pg . " 记录数 " . $jl;
    file_put_contents("ss.txt", $f_tt);
    echo "<script>location.href='wenwen.php?jl=" . $jl . "&sp=" . $sp . "&pg=" . $pg . "&rs=" . $rs . "';</script>";
    exit();
}
?>
<link href="style.css" rel="stylesheet" type="text/css" />
<table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC">
<tr> <td height="50" align="center" bgcolor="#00CC00"><h1><a href="http://www.jianlila.com">荐礼啦</a>知道问问采集插件</h1></td> </tr>
</table>
<table width="700" border="0" align="center" cellspacing="1" bgcolor="#CCCCCC" style="margin-top:6px; margin-bottom:6px;">
<tr> <td height="30" align="center" bgcolor="#FFFFFF"><a href="cj_config.php">采集设置</a> <a href="uninstall.php" onclick="return confirm('您确定要卸载采集插件吗');">卸载采集</a> <a href="cj_view.php">查看采集记录</a> <a href="cj_help.php">采集帮助</a> <a href="baidu.php" target="_blank">知道采集</a> <a href="wenwen.php" target="_blank">问问采集</a></td> </tr>
</table>
<form action="wenwen.php" method="post">
<table width="628" height="49" border="0" align="center">
<tr> <td width="413" align="right"><input name="ask" type="text" id="ask" size="50"></td> <td width="205"><input type="submit" name="button" id="button" value="问问采集" style=" padding-left:15px; padding-right:15px; height:25px; line-height:25px;"></td> </tr>
</table>
</form>
|