Scraping Douban Top 250 Movie Information with Python

The script uses the requests module to fetch each page, lxml's etree to extract the data, and the time module to pause between requests. The target pages are the Douban Top 250 movie list. The Python code is as follows:
# -*- coding: utf-8 -*-
# Email: 3195841740@qq.com
# Author: 21292
# Date: 2020/3/8 11:05
# IDE: PyCharm
import requests
from lxml import etree
import re
import time

headers = {
    'Cookie': 'll="118375"; bid=LweMDRu6xy0; __utmz=30149280.1582607485.1.1.utmcsr=sogou.com|utmccn=(referral)|utmcmd=referral|utmcct=/link; __utmz=223695111.1583572638.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __yadk_uid=sIlSb4fUzktAAB7ns01bryACK9TG0Ytt; _vwo_uuid_v2=D4D430B7FCD55769AFD16F4AB7B8A5907|ae49228565fb206135f49f584eb2c78e; __gads=ID=a3adab5ce8eafc57:T=1583573105:S=ALNI_MYInfQ1FlG09Ho82DR2aEpSSXRC_Q; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583668047%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.502674428.1582607485.1583572636.1583668047.3; __utmb=30149280.0.10.1583668047; __utmc=30149280; __utma=223695111.2100654023.1583572638.1583572638.1583668047.2; __utmb=223695111.0.10.1583668047; __utmc=223695111; _pk_id.100001.4cf6=96f806704894c344.1583572638.2.1583668060.1583573242.',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}

# Print one movie's fields to the console
def print_movies(movies):
    print("*" * 100)
    for each in movies:
        time.sleep(0.5)
        if each == '主演':
            # The first actor goes on the label line, the rest are indented below it
            for x in range(len(movies['主演'])):
                time.sleep(0.5)
                if x == 0:
                    print(each, ':', movies['主演'][x])
                else:
                    print('     ', movies['主演'][x])
        else:
            print(each, ':', movies[each])
    print("*" * 100)

# Fetch a URL and return the parsed HTML tree
def HTML_spider_detail(url):
    response = requests.get(url, headers=headers, allow_redirects=False)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    return html

# Collect every movie's detail-page URL from one list page
def get_detail_urls(url):
    html = HTML_spider_detail(url)
    detail_urls = html.xpath('//div[@class = "info"]/div[1]//@href')
    for detail_url in detail_urls:
        print('Requesting:', detail_url, '....................')
        spider_detail_url(detail_url)
        time.sleep(0.5)

# Scrape one movie's detail page into a dict and print it
def spider_detail_url(url):
    movies = {}
    html = HTML_spider_detail(url)
    # Movie title
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Release year, e.g. "(1994)" -> "1994"
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]', movie_year)[0]
    movies['年份'] = movie_year
    # Director
    movie_director = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_director
    # Douban rating
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    # Scan the info block for language, release date and running time
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(len(infos)):
        if infos[index] == "语言:":
            movies['语言'] = infos[index + 1]
        elif infos[index] == '上映日期:':
            movies['上映时间'] = infos[index + 2]
        elif infos[index] == "片长:":
            movies['片长'] = infos[index + 2]
    # Synopsis
    movie_summary = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_summary
    print_movies(movies)

if __name__ == '__main__':
    # Top 250 = 10 pages of 25 movies each
    for i in range(10):
        count = i * 25
        url = 'https://movie.douban.com/top250?start=' + str(count) + '&filter='
        get_detail_urls(url)
Next, modify the code so the data is stored in an Excel file instead of printed. The full code is as follows:
# -*- coding: utf-8 -*-
# Email: 3195841740@qq.com
# Author: 21292
# Date: 2020/3/8 11:05
# IDE: PyCharm
import requests
from lxml import etree
import re
import time
import openpyxl

# Request headers; if the crawl starts failing, refresh these two values
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Cookie': 'll="118375"; bid=Gnw8x-tUTyQ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583715518%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=dac2da16aa651d16.1582123086.16.1583716968.1583677611.; __yadk_uid=skuIGmPsoBorvw32ahEZf6sqfam16Rtj; __utma=30149280.1846912402.1582123089.1583715518.1583728695.18; __utmz=30149280.1583677417.16.10.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.1046665669.1582123089.1583677417.1583715518.16; __utmz=223695111.1583677417.15.9.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=e685b433defaf33d:T=1582374422:S=ALNI_MbPZ69DTsUEApb-_etqVfoNXEAO5g; ct=y; _vwo_uuid_v2=D630691FC9560003A3D895CB985C1B204|b22c83cc5ece5c99126b175f0bd35663; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18932; __utmc=30149280; __utmc=223695111; dbcl2="189323804:LgqGltbQVkw"; ck=p11c; __utmb=30149280.6.9.1583728745418; __utmt=1'
}
# Write all scraped movies to an Excel workbook
def save_detail_movies(movies_list):
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = "电影数据"
    # Header row
    sheet['A1'] = '电影名称'
    sheet['B1'] = '年份'
    sheet['C1'] = '导演'
    sheet['D1'] = '豆瓣评分'
    sheet['E1'] = '主演'
    sheet['F1'] = '语言'
    sheet['G1'] = '上映时间'
    sheet['H1'] = '片长'
    sheet['I1'] = '电影简介'
    # One movie per row, starting at row 2 (row 1 is the header)
    for index, movie in enumerate(movies_list):
        row = str(index + 2)
        sheet['A' + row] = movie['电影名称']
        sheet['B' + row] = movie['年份']
        sheet['C' + row] = movie['导演']
        sheet['D' + row] = movie['豆瓣评分']
        # Join the cast list into a single cell
        sheet['E' + row] = ' '.join(movie['主演'])
        # Some detail pages lack these fields, so fall back to an empty string
        sheet['F' + row] = movie.get('语言', '')
        sheet['G' + row] = movie.get('上映时间', '')
        sheet['H' + row] = movie.get('片长', '')
        sheet['I' + row] = movie['电影简介']
    # Save the workbook (overwritten on every call)
    file.save('豆瓣电影.xlsx')
# Print one movie's fields to the console
def print_movies(movies):
    print("*" * 100)
    for each in movies:
        time.sleep(0.5)
        if each == '主演':
            # The first actor goes on the label line, the rest are indented below it
            for x in range(len(movies['主演'])):
                time.sleep(0.5)
                if x == 0:
                    print(each, ':', movies['主演'][x])
                else:
                    print('     ', movies['主演'][x])
        else:
            print(each, ':', movies[each])
    print("*" * 100)
# Fetch a URL and return the parsed HTML tree
def HTML_spider_detail(url):
    response = requests.get(url, headers=headers, allow_redirects=False)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    return html

# Collect every movie's detail-page URL from one list page
def get_detail_urls(url, count=[1]):
    html = HTML_spider_detail(url)
    detail_urls = html.xpath('//div[@class = "info"]/div[1]//@href')
    # Visit each detail URL; the mutable default argument keeps a running
    # request counter that persists across calls
    for detail_url in detail_urls:
        print('Requesting URL #' + str(count[0]) + ':', detail_url, '....................')
        spider_detail_url(detail_url)
        count[0] = count[0] + 1
        # Throttle
        time.sleep(0.5)

# All scraped movies are collected in this list
movies_list = []
# Scrape one movie's detail page into a dict, then save the whole list
def spider_detail_url(url):
    # Empty dict for this movie's fields
    movies = {}
    html = HTML_spider_detail(url)
    # Movie title
    movie_name = html.xpath('//div[@id = "content"]/h1/span/text()')[0]
    movies['电影名称'] = movie_name
    # Release year, e.g. "(1994)" -> "1994"
    movie_year = html.xpath('//div[@id = "content"]/h1/span/text()')[1]
    movie_year = re.findall(r'[(](.*?)[)]', movie_year)[0]
    movies['年份'] = movie_year
    # Director
    movie_director = html.xpath('//div[@id = "info"]/span[1]/span[2]/a/text()')[0]
    movies['导演'] = movie_director
    # Douban rating
    movie_value = html.xpath('//div[@class ="rating_self clearfix" ]/strong/text()')[0]
    movies['豆瓣评分'] = movie_value
    # Cast
    movie_actors = html.xpath('//div[@id = "info"]/span[3]/span[2]//a/text()')
    movies['主演'] = movie_actors
    infos = html.xpath('//div[@id = "info"]//text()')
    for index in range(len(infos)):
        # Language
        if infos[index] == "语言:":
            movies['语言'] = infos[index + 1]
        # Exact release date
        elif infos[index] == '上映日期:':
            movies['上映时间'] = infos[index + 2]
        # Running time
        elif infos[index] == "片长:":
            movies['片长'] = infos[index + 2]
    # Synopsis
    movie_summary = html.xpath('//span[@property="v:summary"]/text()')[0].strip()
    movies['电影简介'] = movie_summary
    # print_movies(movies)
    movies_list.append(movies)
    # Rewrite the workbook with everything scraped so far
    save_detail_movies(movies_list)
# Run the crawler
def start():
    # Top 250 = 10 pages of 25 movies each
    for i in range(10):
        count = i * 25
        url = 'https://movie.douban.com/top250?start=' + str(count) + '&filter='
        get_detail_urls(url)

if __name__ == '__main__':
    start()
This run scrapes the information for all 250 movies.
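A quick way to confirm that all 250 rows landed in the workbook is to read it back with openpyxl. A minimal sketch (the filename matches the save_detail_movies call above):

import openpyxl

wb = openpyxl.load_workbook('豆瓣电影.xlsx')
sheet = wb.active
# Row 1 is the header, so the movie count is the row total minus one
print('Movies saved:', sheet.max_row - 1)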
Open the Douban Movies page with the browser's F12 developer tools and inspect the Ajax requests, using comedy films as an example. The captured request gives the URL, the GET method, and its query parameters; the response body is JSON. The code is as follows:
import requests
import json

# Fake the User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
}
# Target URL
url = 'https://movie.douban.com/j/chart/top_list'
# Wrap the query parameters
params = {
    'type': '24',             # category id (24 = comedy)
    'interval_id': '100:90',
    'action': '',
    'start': '0',             # index of the first movie to fetch
    'limit': '20',            # number of movies per request
}
# Send the request
response = requests.get(url=url, params=params, headers=headers)
# Parse the response body as JSON
list_data = response.json()
# Persist to disk
fp = open('./douban.json', 'w', encoding='utf-8')
json.dump(list_data, fp=fp, ensure_ascii=False)
fp.close()
print("Over")
Preface: once you have some basic crawling knowledge, simple crawlers are a good way to practice. This one uses xpath to do simple data extraction. The targets are each movie's title, director and cast information, rating, and URL. Environment: PyCharm, Python 3, the requests library, and lxml (for its xpath support).

Step 1: analyze the URLs and plan the approach. Search for the Douban Movie Top 250 and open the site. The data is not on a single page but spread over ten, so what to do? First observe:
Page 1: https://movie.douban.com/top250?start=0&filter=
Page 2: https://movie.douban.com/top250?start=25&filter=
Page 3: https://movie.douban.com/top250?start=50&filter=
The conclusion is that the URLs follow a rule: the number after start equals the page number minus one, times twenty-five. A single loop can therefore cover all ten pages (Figure 1.1). Add the usual request headers to finish this step (Figure 1.2).

Step 2: send the requests and process the data with xpath. Xpath locates the needed information quickly through document nodes. Open the browser's developer tools: all the movie data sits inside div[@class="info"] tags, and drilling down level by level reaches every field (Figure 2.1, Figure 2.2). The developer tools can copy an xpath for you, but this is generally not recommended because the copied paths break easily.

Step 3: save the data. The csv module works well here (Figure 3.1). The code is as follows:

import requests
import csv
from lxml import etree

# Output file on the desktop
temp = r'C:\Users\86177\Desktop' + '\\' + 'doubanmovie'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.7 Safari/537.36'
}
with open(temp + '.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, dialect='excel')
    csvwriter.writerow(['title', 'info', 'average', 'link'])
    # Ten list pages, 25 movies per page
    for n in range(10):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(n * 25)
        response = requests.get(url, headers=header)
        response.encoding = 'utf-8'
        html1 = etree.HTML(response.text)
        # Every movie's data lives inside a div with class "info"
        for i in html1.xpath('//div[@class="info"]'):
            title = i.xpath('div[@class="hd"]/a/span[@class="title"]/text()')
            info = i.xpath('div[@class="bd"]/p/text()')
            average = i.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
            link = i.xpath('div[@class="hd"]/a/@href')
            for a, b, c, d in zip(title, info, average, link):
                print(a, b, c, d)
                csvwriter.writerow([a, b, c, d])

Summary: Douban applies relatively few anti-crawling measures here, which makes this a good exercise for crawler beginners to build up practical scraping skills.
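A possible variation on step 3 (a sketch, not part of the original post): csv.DictWriter names each column explicitly, so rows no longer depend on positional order. The row values below are placeholders; in the real crawl they come from the xpath extraction above.

import csv

fieldnames = ['title', 'info', 'average', 'link']
with open('doubanmovie.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    # Placeholder row for illustration only
    writer.writerow({'title': 'some title', 'info': 'director / cast',
                     'average': '9.0', 'link': 'https://movie.douban.com/subject/0000000/'})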