# -*- coding: utf-8 -*-
"""
Scrape the Maoyan movie board. Fields collected for each film:
rank, title, starring actors, release date and rating.
"""
import re
import time
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException

# Fetch the HTML of the given url; return None if the request fails
def getHTML(url, code='utf-8'):
    # Send a browser-like User-Agent so the site serves the normal page
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    try:
        response = requests.get(url, headers=header)
        response.raise_for_status()
        response.encoding = code
        return response.text
    except RequestException:
        print('getHTML Error')
        return None
# Parse the HTML with a regular expression and yield one record per movie
def parseHTML(html):
    pattern = re.compile('board-index-.*?">(.*?)</i>.*?class="name">.*?'
                         + '"boarditem-click".*?"{movieId:.*?}">+(.*?)</a>.*?class="star">'
                         + '(.*?)</p>.*?class="releasetime">(.*?)</p>.*?<p class="score">'
                         + '<i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
    items = re.findall(pattern, html)
    # The six capture groups are: rank, title, actors, release date,
    # and the integer and fraction halves of the score
    for item in items:
        yield {
            'rank': item[0],
            'title': item[1],
            'actors': item[2].strip(),
            'release_time': item[3],
            'score': item[4] + item[5],
        }
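# For reference, the page fragment that parseHTML's pattern is aimed at looks
# roughly like the sketch below. It is reconstructed from the capture groups
# above with placeholder values, not copied from the live site:
#   <i class="board-index-RANK">RANK</i>
#   ... class="name"> ... "boarditem-click" ... "{movieId:...}">TITLE</a>
#   ... class="star"> ACTORS </p> ... class="releasetime">RELEASE DATE</p>
#   <p class="score"><i class="integer">INT.</i><i class="fraction">FRAC</i></p>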
# Append one record to a local text file
def writePAGE(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(str(content) + '\n')

# Crawl one page of the board: fetch, parse and save every record on it
def main(page):
    url = 'https://maoyan.com/board/4?offset=' + str(page)
    html = getHTML(url)
    if html is None:  # the request failed, so there is nothing to parse
        return
    items = parseHTML(html)
    for item in items:
        print(item)
        writePAGE(item)

if __name__ == '__main__':
    start = time.time()
    # Crawl the ten pages (offset = 0, 10, ..., 90) in parallel
    pool = Pool()
    pool.map(main, [page * 10 for page in range(10)])
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for the worker processes to exit
    end = time.time()
    # Print the total running time
    print('It took %s s' % (end - start))
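Because writePAGE stores every record as str(dict) on its own line, result.txt can be loaded back into dictionaries later. Below is a minimal sketch of such a reader, assuming the file and field names produced by the script above; readPAGE is a hypothetical helper, not part of the original code.

# Read result.txt back into a list of dictionaries
import ast

def readPAGE(path='result.txt'):
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                # each non-empty line is the repr of a dict; literal_eval restores it
                records.append(ast.literal_eval(line))
    return records

if __name__ == '__main__':
    for record in readPAGE():
        print(record['rank'], record['title'], record['score'])

If the data is meant for further processing, writing each record with the json or csv module instead of str() would make this round trip more robust.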