Giv*_*Now 3 python selenium beautifulsoup python-requests
我需要从网站上抓取电子邮件.它在浏览器中可见但是当我尝试用请求刮取它时\ BeautifulSoup我得到了这个:"[email protected]"
我可以用Selenium做到这一点,但它需要更多的时间,我想知道是否有可能用请求\ BeautifulSoup刮掉这些电子邮件?也许需要使用一些库来处理js.
电子邮件标签:
<span id="signature_email"><a class="__cf_email__" href="/cdn-cgi/l/email-protection" data-cfemail="30425f5e70584346515c5c531e535f5d">[email protected]</a><script data-cfhash='f9e31' type="text/javascript">/* <![CDATA[ */!function(t,e,r,n,c,a,p){try{t=document.currentScript||function(){for(t=document.getElementsByTagName('script'),e=t.length;e--;)if(t[e].getAttribute('data-cfhash'))return t[e]}();if(t&&(c=t.previousSibling)){p=t.parentNode;if(a=c.getAttribute('data-cfemail')){for(e='',r='0x'+a.substr(0,2)|0,n=2;a.length-n;n+=2)e+='%'+('0'+('0x'+a.substr(n,2)^r).toString(16)).slice(-2);p.replaceChild(document.createTextNode(decodeURIComponent(e)),c)}p.removeChild(t)}}catch(u){}}()/* ]]> */</script></span></span> <span class="separator">|</span>
Run Code Online (Sandbox Code Playgroud)
Roh*_*man 14
所有语言的代码都在这里:
JavaScript
function cfDecodeEmail(encodedString) {
var email = "", r = parseInt(encodedString.substr(0, 2), 16), n, i;
for (n = 2; encodedString.length - n; n += 2){
i = parseInt(encodedString.substr(n, 2), 16) ^ r;
email += String.fromCharCode(i);
}
return email;
}
console.log(cfDecodeEmail("543931142127353935313e352e7a373b39")); // usage
Run Code Online (Sandbox Code Playgroud)
Python
def cfDecodeEmail(encodedString):
r = int(encodedString[:2],16)
email = ''.join([chr(int(encodedString[i:i+2], 16) ^ r) for i in range(2, len(encodedString), 2)])
return email
print cfDecodeEmail('543931142127353935313e352e7a373b39') # usage
Run Code Online (Sandbox Code Playgroud)
PHP
function cfDecodeEmail($encodedString){
$k = hexdec(substr($encodedString,0,2));
for($i=2,$email='';$i<strlen($encodedString)-1;$i+=2){
$email.=chr(hexdec(substr($encodedString,$i,2))^$k);
}
return $email;
}
echo cfDecodeEmail('543931142127353935313e352e7a373b39'); // usage
Run Code Online (Sandbox Code Playgroud)
去
package main
import (
"bytes"
"strconv"
)
func cf(a string) (s string) {
var e bytes.Buffer
r, _ := strconv.ParseInt(a[0:2], 16, 0)
for n := 4; n < len(a)+2; n += 2 {
i, _ := strconv.ParseInt(a[n-2:n], 16, 0)
e.WriteString(string(i ^ r))
}
return e.String()
}
func main() {
email := cf("543931142127353935313e352e7a373b39") // usage
print(email)
print("\n")
}
Run Code Online (Sandbox Code Playgroud)
C++
#include <iostream>
#include <string>
using namespace std;
string cfDecodeEmail(string encodedString);
int main()
{
cout << cfDecodeEmail("543931142127353935313e352e7a373b39") << endl;
}
string cfDecodeEmail(string encodedString)
{
string email;
char xorKey = stoi( encodedString.substr(0, 2), nullptr, 16);
for( unsigned i = 2; i < encodedString.length(); i += 2)
email += stoi( encodedString.substr(i, 2), nullptr, 16) ^ xorKey;
return email;
}
Run Code Online (Sandbox Code Playgroud)
C#
using System;
public class Program
{
public static string cfDecodeEmail(string encodedString)
{
string email = "";
int r = Convert.ToInt32(encodedString.Substring(0, 2), 16), n, i;
for (n = 2; encodedString.Length - n > 0; n += 2)
{
i = Convert.ToInt32(encodedString.Substring(n, 2), 16) ^ r;
char character = (char)i;
email += Convert.ToString(character);
}
return email;
}
public static void Main(string[] args)
{
Console.WriteLine(cfDecodeEmail("543931142127353935313e352e7a373b39")); // usage
}
}
Run Code Online (Sandbox Code Playgroud)
从CF标签中,在您提供的html中,我假设您正在抓一个cloudflare网站.它们提供了一种功能,可以混淆列出的电子邮件(请参阅此处),该电子邮件会对HTML中的地址进行加密,并使用JavaScript对其进行解密.因此,使用selenium你会看到电子邮件地址,但使用你不会的请求.
由于解密方法可以很容易地从JavaScript中获取,因此您可以在Python中编写自己的解密方法.
在JavaScript中,
(function () {
try {
var s, a, i, j, r, c, l = document.getElementById("__cf_email__");
a = l.className;
if (a) {
s = '';
r = parseInt(a.substr(0, 2), 16);
for (j = 2; a.length - j; j += 2) {
c = parseInt(a.substr(j, 2), 16) ^ r;
s += String.fromCharCode(c);
}
s = document.createTextNode(s);
l.parentNode.replaceChild(s, l);
}
} catch (e) {}
})();
Run Code Online (Sandbox Code Playgroud)
在Python中,
def decodeEmail(e):
de = ""
k = int(e[:2], 16)
for i in range(2, len(e)-1, 2):
de += chr(int(e[i:i+2], 16)^k)
return de
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
2573 次 |
| 最近记录: |