/**
 * Fetch a remote HTML page and extract its title, stylesheet links and <body> markup.
 *
 * The page is requested with a mobile Safari user agent and up to 5 redirects are
 * followed. HTML comments are stripped, root-relative stylesheet `href` and image
 * `src` attributes are rewritten to absolute URLs on the host that served the page,
 * and the extracted title/content are converted to UTF-8.
 *
 * @param string $url Page address; "http://" is prepended when it has no http(s) scheme.
 * @return array Always has 'list' (null) and 'status' (0 = failure, 1 = success) plus
 *               'info' (error message on failure; stylesheet links followed by the body
 *               HTML on success). 'title' is only present on success.
 */
function contentCollection($url){
    $data = array('list'=>null,'status'=>0);
    if(!$url){
        $data['info'] = '请传入采集地址';
        return $data;
    }
    // Require a real scheme: a bare "^http" test would treat a host such as
    // "httpbin.org" as already absolute and leave it without one.
    if(!preg_match('/^https?:\/\//i', $url)){
        $url = 'http://'.$url;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response as a string instead of printing it
    curl_setopt($ch, CURLOPT_FRESH_CONNECT, true);  // force a new connection instead of a cached one
    curl_setopt($ch, CURLOPT_FAILONERROR, true);    // treat HTTP status >= 400 as a transfer error
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B137 Safari/601.1'); // user agent
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);    // seconds to wait while connecting
    curl_setopt($ch, CURLOPT_HEADER, false);        // only the body is needed
    // Only ever talk http/https, including after a redirect.
    if (defined('CURLOPT_PROTOCOLS') && defined('CURLOPT_REDIR_PROTOCOLS')) {
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    }

    // Maximum number of redirects / requests.
    $redirects = 5;
    $too_many_redirects = false;
    if (!ini_get('open_basedir') && !ini_get('safe_mode')) {
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // let cURL follow redirects
        curl_setopt($ch, CURLOPT_MAXREDIRS, $redirects);
        $content = curl_exec($ch);
    } else {
        // Follow redirects by hand when open_basedir/safe_mode is in effect.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($ch, CURLOPT_FORBID_REUSE, false);
        do {
            $content = curl_exec($ch);
            if ($content === false || curl_errno($ch)) {
                break;
            }
            $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
            if (!in_array($code, array(301, 302, 303, 307, 308), true)) {
                break;
            }
            // cURL reports the absolute redirect target, so relative Location
            // headers and header-name casing do not need to be handled here.
            $next_url = curl_getinfo($ch, CURLINFO_REDIRECT_URL);
            if (!$next_url) {
                break;
            }
            curl_setopt($ch, CURLOPT_URL, $next_url);
        } while (--$redirects);
        $too_many_redirects = !$redirects;
    }
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $effective_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
    // Close the handle before any return so it is released on every path.
    curl_close($ch);

    if ($too_many_redirects) {
        $data['info'] = '重定向次数太多。';
        return $data;
    }
    if(200 != $http_code){
        $data['info'] = '采集失败,http_code:'.$http_code;
        return $data;
    }
    if (!is_string($content) || $content === '') {
        $data['info'] = '没有抓取到内容';
        return $data;
    }

    // Resolve relative assets against the host that actually served the page
    // (after redirects), falling back to the requested host.
    $host_pattern = '/^https?:\/\/[^\/?#]+/i';
    $base = '';
    if (is_string($effective_url) && preg_match($host_pattern, $effective_url, $host_ary)) {
        $base = $host_ary[0];
    } elseif (preg_match($host_pattern, $url, $host_ary)) {
        $base = $host_ary[0];
    }
    // Escape "$" and "\" so the host cannot be read as a back-reference.
    $absolute_prefix = '${1}' . addcslashes($base, '\\$') . '/';

    // Drop HTML comments; keep the original markup if PCRE fails (null result).
    $without_comments = preg_replace("/<\!--(.*?)-->/is", '', $content);
    if ($without_comments !== null) {
        $content = $without_comments;
    }

    if(!preg_match("/<title[^>]*?>(.*?)<\/title>(.*)<body[^>]*?>(.*?)<\/body>/is", $content, $body)){
        $data['info'] = '没有抓取到内容';
        return $data;
    }
    $title = $body[1];
    $collection_content = $body[3];

    // Collect stylesheet <link> tags and make root-relative hrefs absolute.
    // "(?!\/)" leaves protocol-relative URLs ("//cdn.example.com/x.css") untouched.
    $link_str = '';
    if (preg_match_all("/<link[^>]*rel=['\"]stylesheet['\"][^>]*>/is", $content, $link) && $link[0]) {
        $absolute_links = preg_replace("/(href=['\"])\/(?!\/)/i", $absolute_prefix, $link[0]);
        $link_str = implode('', is_array($absolute_links) ? $absolute_links : $link[0]);
    }

    // Same rewrite for root-relative image sources inside the body.
    $absolute_images = preg_replace("/(<img[^>]*?src=['\"])\/(?!\/)/i", $absolute_prefix, $collection_content);
    if ($absolute_images !== null) {
        $collection_content = $absolute_images;
    }
    $collection_content = $link_str . $collection_content;

    // Decide the source encoding once for the whole document so the title and
    // the body are decoded consistently; valid UTF-8 is taken as UTF-8,
    // anything else falls back to detection over the original candidate list.
    $from_encoding = mb_check_encoding($content, 'UTF-8') ? 'UTF-8' : 'GBK,UTF-8,ASCII';

    $data['status'] = 1;
    $data['info'] = mb_convert_encoding($collection_content, 'utf-8', $from_encoding);
    $data['title'] = mb_convert_encoding($title, 'utf-8', $from_encoding);
    return $data;
}