用 Python 爬取豆瓣电影 Top 250 并保存到 Excel 表格!

代码比较简单,直接上 code,可作为爬虫入门的小 demo!需要了解正则表达式,并会简单使用 lxml、requests、openpyxl 这几个第三方库!代码仅供参考!请勿非法使用!

源码

import requests
import re
from lxml import etree
import os
import time
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font, colors, Alignment

def getindex(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body as a string.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.37'
    }
    # The context manager releases the connection even if decoding fails.
    with requests.get(url, headers=header, timeout=timeout) as response:
        response.encoding = 'utf-8'
        return response.text


def get_index_title(index):
    """Return the stripped text of the first ``<title>`` element in the page.

    Args:
        index: HTML source of the page.

    Returns:
        The title text, or an empty string when no title text is found
        (instead of failing with IndexError on an unexpected page).
    """
    tree = etree.HTML(index)
    # NOTE(review): some lxml versions appear to return None for input with
    # no parsable content - guarded here; confirm against the lxml in use.
    if tree is None:
        return ''
    titles = tree.xpath('//title/text()')
    return titles[0].strip() if titles else ''


def get_movie_dir(index):
    """Extract one record per ``<div class="item">`` entry found in the page.

    Args:
        index: HTML source of a list page.

    Returns:
        A list of dicts with the keys ``link``, ``name``, ``year``, ``score``
        and ``people``, in page order. ``year`` has surrounding whitespace
        removed; the other values are the raw captured text.
    """
    item_pattern = re.compile(
        r'<div class="item">.*?<a href="(?P<link>.*?)">.*?<span class="title">(?P<name>.*?)'
        r'</span>.*?<br>(?P<year>.*?)&nbsp.*?v:average">(?P<score>.*?)'
        r'</span>.*?<span>(?P<people>.*?)</span>', re.S)

    # The year is captured together with the line break and indentation that
    # follow <br>, so it is the only field that needs trimming.
    return [
        {**match.groupdict(), 'year': match.group('year').strip()}
        for match in item_pattern.finditer(index)
    ]


def out_data_in_file(movie_dir_list):
    """Write the movie records to ``./movie/movie.txt``, one line per movie.

    The ``./movie/`` directory is created when it does not exist yet. The
    text file is overwritten on every call.

    Args:
        movie_dir_list: Iterable of dicts with the keys ``name``, ``year``,
            ``score``, ``people`` and ``link``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    if os.path.exists("./movie/"):
        print('在当前目录下已存在movie目录')
    else:
        try:
            os.makedirs('./movie/')
        except OSError:
            # Report the failure, then surface the real cause instead of
            # letting open() below fail with a less helpful error.
            print('movie目录创建失败')
            raise
        print('已在当前目录下创建movie目录')

    with open('./movie/movie.txt', mode='w', encoding='utf-8') as f:
        f.writelines(
            f'电影名称:{item_dir["name"]}   电影年份:{item_dir["year"]}  电影得分:{item_dir["score"]}  评分人数:{item_dir["people"]}  豆瓣链接:{item_dir["link"]}\n'
            for item_dir in movie_dir_list
        )

    # Announce success only after the file has been flushed and closed.
    print('文件已写入并保存')



def out_data_in_excel(movie_dir_list, cor):
    """Write movie records into the ``movie`` sheet of ``./movie/movie.xlsx``.

    A workbook already present at that path is loaded and updated in place;
    otherwise a new one is created. Row 1 always receives the column
    headings, and the records are written one per row starting at ``cor``.

    Args:
        movie_dir_list: Iterable of dicts with the keys ``name``, ``year``,
            ``score``, ``people`` and ``link``.
        cor: 1-based row number at which the first record is written.
    """
    excel_path = './movie/movie.xlsx'
    if os.path.exists(excel_path):
        excel_movie = load_workbook(excel_path)
    else:
        excel_movie = Workbook()  # start a fresh workbook

    sheet = excel_movie.active  # work on the active sheet
    sheet.title = 'movie'
    # Taller header row; wider columns A (name) and D (vote count).
    sheet.row_dimensions[1].height = 40
    sheet.column_dimensions['A'].width = 20
    sheet.column_dimensions['D'].width = 20

    title = ['电影名称', '电影年份', '电影得分', '评分人数', '豆瓣链接']
    for column, heading in enumerate(title, start=1):
        sheet.cell(1, column).value = heading

    # Record keys in the same left-to-right order as the headings above.
    fields = ('name', 'year', 'score', 'people', 'link')
    for row, item_dir in enumerate(movie_dir_list, start=cor):
        for column, key in enumerate(fields, start=1):
            sheet.cell(row, column).value = item_dir[key]

    # Saving fails when the target directory is missing, which is the case
    # on a first run that never created ./movie/.
    os.makedirs(os.path.dirname(excel_path), exist_ok=True)
    excel_movie.save(excel_path)
    # Report success only once the workbook is actually on disk.
    print('success')





if __name__ == '__main__':
    # Ask how many 25-entry list pages to fetch (1 to 10 pages).
    num = input('请输入需要爬取的页数(0<num<=10)')
    try:
        page_count = int(num)
    except ValueError:
        page_count = None

    # Validate outside the try block: raising SystemExit inside a bare
    # try/except used to be caught again and printed the message twice.
    if page_count is None or not 0 < page_count <= 10:
        print('非法请求')
        raise SystemExit(0)

    for page in range(page_count):
        start = page * 25  # offset of the first entry on this page
        url = f"https://movie.douban.com/top250?start={start}&filter="
        index = getindex(url)
        time.sleep(3)  # pause between requests
        print(get_index_title(index))
        movie_dir_list = get_movie_dir(index)
        # Row 1 holds the headings, so page N starts at row N * 25 + 2.
        out_data_in_excel(movie_dir_list, cor=start + 2)

 

微信关注

WeChat

阅读剩余
THE END