最终更新:目标网站似乎屏蔽了 DO 的 IP,这正是我这几天一直在排查的问题的原因。我启动了一个 EC2 实例,并让代码配合缓存等机制正常工作,以减少对该网站的请求次数,同时让我的用户可以分享该网站。
-
更新:我把 cURL 的 CURLOPT_FAILONERROR 选项关闭后,成功获取到了 HTML;但是该网站除了返回 405 错误之外,也没有设置加载网站内容所需的某些 cookie。
// Turn off fail-on-error so cURL still hands back the response body when the
// server answers with an HTTP status >= 400 (here: 405 followed by the full HTML).
curl_setopt($ch, CURLOPT_FAILONERROR, FALSE);
我正在通过 ajax 调用 PHP,使用以下代码检索网站的 og: meta 标签。该代码在大多数网站上都能正常运行,但有一两个特定站点会返回错误,无法检索到信息。错误如下:
Warning: DOMDocument::loadHTML(): Empty string supplied as input in /my/home/path/getUrlMeta.php on line 58
以下是我的 error_log 中记录的 curl_error 信息:
The requested URL returned error: 405 Not Allowed
以及
Failed to connect to www.something.com port 443: Connection refused
当我在服务器控制台上使用curl时,获取网站的html没有问题,并且使用下面的代码检索大多数网站所需的信息也没有问题
/**
 * Fetch the body of a URL with cURL, sending browser-like request headers.
 *
 * HTTP error statuses (>= 400) do NOT abort the transfer: some sites answer
 * with e.g. "405 Not Allowed" and still send the full HTML document, so the
 * body is returned regardless of the status code and the status is only logged.
 *
 * @param string $url Absolute URL to request.
 * @return string|false Response body, or false when the transfer itself failed
 *                      (DNS failure, connection refused, timeout, ...).
 */
function file_get_contents_curl($url)
{
    $ch = curl_init();

    // Build the header list explicitly instead of appending to an undefined variable.
    $header = array(
        "Accept: text/html, text/xml,application/xml,application/xhtml+xml,"
            . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: no-cache",
    );

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    //curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');

    // Must stay off: with FAILONERROR enabled cURL discards the body of any
    // response with status >= 400, which made 405-but-valid pages come back empty.
    curl_setopt($ch, CURLOPT_FAILONERROR, false);

    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT,"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:31.0) Gecko/20100101 Firefox/31.0 " );

    // NOTE(review): certificate and host verification are disabled, which allows
    // man-in-the-middle tampering. Kept as-is for compatibility with sites the
    // scraper already handles; re-enable once a CA bundle is configured.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

    //The following 2 set up lines work with sites like www.nytimes.com
    //Update: Added option for cookie jar since some websites recommended it. cookies.txt is set to permission 777. Still doesn't work.
    $cookiefile = '/home/my/folder/cookies.txt';
    curl_setopt( $ch, CURLOPT_COOKIESESSION, true );
    curl_setopt( $ch, CURLOPT_COOKIEJAR, $cookiefile );
    curl_setopt( $ch, CURLOPT_COOKIEFILE, $cookiefile );

    $data = curl_exec($ch);

    if ($data === false || curl_errno($ch)) {
        // Transport-level failure: nothing usable was received.
        error_log(curl_error($ch));
    } else {
        // The body is kept even for error statuses; record the status for diagnosis.
        $status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            error_log("file_get_contents_curl: HTTP $status returned for $url");
        }
    }

    curl_close($ch);
    return $data;
}
// Entry script: fetch $url, extract its Open Graph (og:*) meta tags and print
// them as a JSON object keyed by the property name without the "og:" prefix.
// When no og:image is present, fall back to the first <img> wider than 600px.
$html = file_get_contents_curl($url);

// Real-world HTML is rarely valid; collect libxml parse errors internally
// instead of emitting a warning for each one.
libxml_use_internal_errors(true);

$rmetas = array();

// The fetch yields false on a failed transfer and '' for an empty body.
// DOMDocument::loadHTML() warns (PHP 5/7) or throws ValueError (PHP 8) on an
// empty string, so parsing is skipped and an empty result is emitted instead.
if (is_string($html) && $html !== '') {
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    $xpath = new DOMXPath($doc);

    $metas = $xpath->query('//*/meta[starts-with(@property, \'og:\')]');
    foreach ($metas as $meta) {
        // Strip the leading "og:" (3 characters) from the property name.
        $property = substr($meta->getAttribute('property'), 3);
        $rmetas[$property] = $meta->getAttribute('content');
    }

    /* Fallback: pick the first image wider than 600px should og:image be empty. */
    if (empty($rmetas['image'])) {
        foreach ($xpath->query('//*/img') as $img) {
            $src = $img->getAttribute('src');

            // Only absolute http(s) URLs ending in jpg / png / (j)peg are probed.
            $isAbsolute = substr($src, 0, 4) == 'http';
            $isImageExt = in_array(substr($src, -3), array('jpg','png','peg'), true);
            if (!$isAbsolute || !$isImageExt) {
                continue;
            }

            // getimagesize() downloads the image; it returns false on failure,
            // otherwise index 0 holds the width in pixels.
            $size = getimagesize($src);
            if ($size !== false && $size[0] > 600) {
                $rmetas['image'] = $src;
                // Leave the foreach itself: stop at the FIRST match so the
                // remaining images are not fetched and cannot override it.
                break;
            }
        }
    }
}

echo json_encode($rmetas);
die();
Run Code Online (Sandbox Code Playgroud)
更新:是我弄错了,该网站并未启用 https;但我仍然会遇到 405 Not Allowed 错误。
cURL 信息:
{
"url": "http://www.example.com/",
"content_type": null,
"http_code": 405,
"header_size": 0,
"request_size": 458,
"filetime": -1,
"ssl_verify_result": 0,
"redirect_count": 0,
"total_time": 0.326782,
"namelookup_time": 0.004364,
"connect_time": 0.007725,
"pretransfer_time": 0.007867,
"size_upload": 0,
"size_download": 0,
"speed_download": 0,
"speed_upload": 0,
"download_content_length": -1,
"upload_content_length": -1,
"starttransfer_time": 0.326634,
"redirect_time": 0,
"redirect_url": "",
"primary_ip": "SOME IP",
"certinfo": [],
"primary_port": 80,
"local_ip": "SOME IP",
"local_port": 52966
}
Run Code Online (Sandbox Code Playgroud)
更新:如果我在控制台执行 curl -i,会得到以下响应:状态码是 405,但响应头之后跟着我需要的全部 HTML。
Home> curl -i http://www.domain.com
HTTP/1.1 405 Not Allowed
Server: nginx
Date: Wed, 22 Feb 2017 17:57:03 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Vary: Accept-Encoding
Vary: Accept-Encoding
Set-Cookie: PHPSESSID2=ko67tfga36gpvrkk0rtqga4g94; path=/; domain=.domain.com
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
Pragma: no-cache
Set-Cookie: __PAGE_REFERRER=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; Max-Age=0; path=/; domain=www.domain.com
Set-Cookie: __PAGE_SITE_REFERRER=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; Max-Age=0; path=/; domain=www.domain.com
X-Repository: legacy
X-App-Server: production-web23:8018
X-App-Server: distil2-kvm:80
Run Code Online (Sandbox Code Playgroud)
将以下内容添加到您的代码中以帮助调试问题:
// Debugging aid: dump cURL's details about the transfer on this handle
// (URL, HTTP status code, timings, IPs/ports, ...).
// Needs the $ch handle, so run it after curl_exec() and before curl_close($ch).
$info = curl_getinfo($ch);
print_r( $info );
Run Code Online (Sandbox Code Playgroud)
问题很可能如下: