Python爬取豆瓣电影 Top 250到excel表格!
代码比较简单,直接上code,可作为爬虫入门小demo!需要了解正则表达式,并会简单使用lxml、requests、openpyxl等第三方库!代码仅供参考!请勿非法使用!
源码
import requests
import re
from lxml import etree
import os
import time
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, colors, Alignment
def getindex(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Browser-like User-Agent; presumably needed so the site does not reject
    # the default client identification -- confirm against the target site.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37'
    }
    # The context manager closes the response even if decoding raises.
    with requests.get(url, headers=header, timeout=timeout) as response:
        response.encoding = 'utf-8'
        return response.text
def get_index_title(index):
    """Return the whitespace-stripped text of the first ``<title>`` element.

    Args:
        index: HTML source of a page, as a string.

    Returns:
        The title text with leading and trailing whitespace removed.

    Raises:
        IndexError: If the document contains no ``<title>`` text node.
    """
    document = etree.HTML(index)
    title_nodes = document.xpath('//title/text()')
    first_title = title_nodes[0]
    return first_title.strip()
def get_movie_dir(index):
    """Extract one dict per movie entry from the HTML of a list page.

    Each ``<div class="item">`` entry is matched with a single regular
    expression and turned into a dict with these keys:

    * ``link``   -- ``href`` of the first ``<a>`` inside the item
    * ``name``   -- text of the first ``<span class="title">``
    * ``year``   -- first token after the ``<br>`` that follows the title
    * ``score``  -- text of the element whose tag ends with ``v:average">``
    * ``people`` -- text of the first attribute-less ``<span>`` after the score

    Args:
        index: HTML source of the page, as a string.

    Returns:
        A list of dicts in document order; empty if nothing matches.
    """
    pattern = re.compile(
        r'<div class="item">.*?<a href="(?P<link>.*?)">'
        r'.*?<span class="title">(?P<name>.*?)</span>'
        # Skip the line break/indentation that may follow <br>, then take the
        # token up to the next whitespace, entity (&nbsp;) or tag. Capturing
        # lazily up to the first space would yield an empty year whenever
        # whitespace directly follows <br>.
        r'.*?<br>\s*(?P<year>[^\s&<]*)'
        r'.*?v:average">(?P<score>.*?)</span>'
        r'.*?<span>(?P<people>.*?)</span>',
        re.S,
    )
    return [match.groupdict() for match in pattern.finditer(index)]
def out_data_in_file(movie_dir_list):
    """Write the movie records to ``./movie/movie.txt``, one line per movie.

    The ``./movie/`` directory is created if it does not exist. An existing
    ``movie.txt`` is overwritten.

    Args:
        movie_dir_list: Iterable of dicts with the keys ``name``, ``year``,
            ``score``, ``people`` and ``link``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    if os.path.exists("./movie/"):
        print('在当前目录下已存在movie目录')
    else:
        try:
            # exist_ok avoids failing if the directory appears between the
            # existence check above and this call.
            os.makedirs('./movie/', exist_ok=True)
        except OSError:
            # Report and re-raise: continuing would only fail later in
            # open() with a less informative error.
            print('movie目录创建失败')
            raise
        print('已在当前目录下创建movie目录')
    with open('./movie/movie.txt', mode='w', encoding='utf-8') as f:
        f.writelines(
            f'电影名称:{item_dir["name"]} 电影年份:{item_dir["year"]} 电影得分:{item_dir["score"]} 评分人数:{item_dir["people"]} 豆瓣链接:{item_dir["link"]}\n'
            for item_dir in movie_dir_list
        )
    print('文件已写入并保存')
def out_data_in_excel(movie_dir_list, cor):
    """Write movie records into ``./movie/movie.xlsx`` starting at row *cor*.

    If the workbook already exists it is loaded and updated in place;
    otherwise a new workbook is created. Row 1 always receives the header
    titles, so *cor* should be 2 or greater to avoid overwriting them.

    Args:
        movie_dir_list: Iterable of dicts with the keys ``name``, ``year``,
            ``score``, ``people`` and ``link``.
        cor: 1-based worksheet row where the first record is written; each
            following record goes on the next row.

    Raises:
        OSError: If the output directory or file cannot be written.
    """
    xlsx_path = './movie/movie.xlsx'
    if os.path.exists(xlsx_path):
        excel_movie = load_workbook(xlsx_path)
    else:
        excel_movie = Workbook()
    sheet = excel_movie.active

    # Header row height.
    sheet.row_dimensions[1].height = 40
    # Widen columns A and D.
    sheet.column_dimensions['A'].width = 20
    sheet.column_dimensions['D'].width = 20
    sheet.title = 'movie'

    title = ['电影名称', '电影年份', '电影得分', '评分人数', '豆瓣链接']
    for column, heading in enumerate(title, start=1):
        sheet.cell(1, column).value = heading

    # Column order matches the header titles above.
    fields = ("name", "year", "score", "people", "link")
    for row, item_dir in enumerate(movie_dir_list, start=cor):
        for column, key in enumerate(fields, start=1):
            sheet.cell(row, column).value = item_dir[key]

    # Nothing else on this code path creates the output directory, so make
    # sure it exists before saving.
    os.makedirs('./movie/', exist_ok=True)
    excel_movie.save(xlsx_path)
    # Report success only once the workbook is actually on disk.
    print('success')
if __name__ == '__main__':
    # Script entry point: ask how many list pages to fetch (25 movies per
    # page), then fetch each page, print its title and store its records in
    # the Excel workbook.
    num = input('请输入需要爬取的页数(0<num<=10)')
    try:
        page_count = int(num)
    except ValueError:
        # Non-numeric input is handled the same way as an out-of-range value.
        page_count = 0
    if not 0 < page_count <= 10:
        print('非法请求')
        # Raised outside any try block so that no handler intercepts it and
        # the message is printed exactly once.
        raise SystemExit(0)
    for i in range(page_count):
        start = i * 25  # offset of the first movie on page i
        url = f"https://movie.douban.com/top250?start={start}&filter="
        index = getindex(url)
        time.sleep(3)  # pause between requests
        print(get_index_title(index))
        movie_dir_list = get_movie_dir(index)
        # Row 1 holds the headers, so page i starts at row start + 2.
        out_data_in_excel(movie_dir_list, cor=start + 2)
微信关注
阅读剩余
版权声明:
作者:理想
链接:https://www.imyjs.cn/archives/630
文章版权归作者所有,未经允许请勿转载。
THE END