我正在尝试从Google搜索中获取链接列表:
def google_word(word):
headers={'User-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'}
url = 'https://google.com/search?q={}'.format(word)
res= requests.get(url, headers=headers)
tree= html.fromstring(res.text)
li = tree.xpath("//a[@href]") #list of links that conatin href
y = [link.get('href') for link in li if link.get('href').startswith("https://") if "google" not in link.get('href')]
Run Code Online (Sandbox Code Playgroud)
现在,这段代码收集了以“”开头的正确链接,https://"我还想添加"http://"。为了使该功能生效,我需要添加到列表理解中(我试图一行完成)?
import scrapy
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType
class FooSpider(scrapy.Spider):
name = 'foo'
start_urls = ["https://www.whatismybrowser.com/"]
index=1
def __init__(self, *args, **kwargs):
super(FooSpider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
chrome_options = Options() # Initializing Chrome
#chrome_options.add_argument("--headless")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--ignore-ssl-errors')
IP = '176.31.69.183' # random free proxy from net
PORT = 8080
prox = Proxy()
prox.proxy_type = ProxyType.MANUAL
prox.http_proxy = f'{IP}:{PORT}'
prox.socks_proxy = f'{IP}:{PORT}'
prox.ssl_proxy = f'{IP}:{PORT}'
capabilities = webdriver.DesiredCapabilities.CHROME
prox.add_to_capabilities(capabilities)
self.browser = …Run Code Online (Sandbox Code Playgroud)