- A+
所属分类:编程语言
百度网盘批量 工具 百度云
python代码
- #-*-coding:utf-8-*-
- from selenium import webdriver
- import requests
- import re
- import time
def get_sitemap(timeout=10):
    """Download the hard-coded blog sitemap and return the post URLs in it.

    Args:
        timeout: Seconds to wait for the sitemap server before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list of strings: every ``http://blog.az009.com/...html`` URL found
        inside a ``<loc>`` element of the response body, in document order.
    """
    req = requests.get(
        "http://blog.az009.com/sitemap-posttype-post.201804.xml/",
        timeout=timeout,
    )
    # The dots are escaped so they only match a literal "." -- unescaped,
    # ".html" would also accept e.g. "xhtml" and the host dots any character.
    pattern = re.compile(r"<loc>(http://blog\.az009\.com/.*?\.html)</loc>")
    return pattern.findall(req.text)
# Optional Firefox profile tweaks, currently disabled.
# firefox_profile = webdriver.FirefoxProfile()
# firefox_profile.set_preference('permissions.default.image', 2)  # some Firefox builds only need this one
# firefox_profile.set_preference('browser.migration.version', 9001)  # some builds also need this one
# Disable CSS
# firefox_profile.set_preference('permissions.default.stylesheet', 2)
# Disable Flash
# firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
# Disable JavaScript
# firefox_profile.set_preference('javascript.enabled', 'false')
browser = webdriver.Firefox()
# browser = webdriver.Firefox(firefox_profile=firefox_profile)
try:
    # The first 13 sitemap entries are skipped; i counts from 0 over the rest.
    for i, url in enumerate(get_sitemap()[13:]):
        try:
            browser.get(url)
            print("正在爬取第: {}个网页".format(i))
            # Scroll the page down by setting scrollTop on the root element.
            js = "var q=document.documentElement.scrollTop=5000"
            browser.execute_script(js)
            time.sleep(1)
        except Exception as exc:
            # Best-effort crawl: one failing page must not stop the run,
            # but report it instead of dropping the error silently.
            print("skipping {}: {!r}".format(url, exc))
            continue
finally:
    # Always shut down the browser and its driver process, even when the
    # sitemap download fails or the run is interrupted.
    browser.quit()
我的微信公众号
爱真理,得永生! 爱在灵灵久博客,网罗天下,福利大家!