# Python 爬取某导航网全部数据! (Python: scrape all bookmark data from a navigation site)

import requests
import re
import time
import csv

# Root URL of the navigation site; category links found on the homepage are
# resolved against it further down.
domain = "http://dh.imyjs.cn"
# A timeout keeps the script from hanging forever if the server stalls
# (requests waits indefinitely by default).
resp = requests.get(domain, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page as the homepage.
resp.raise_for_status()
html = resp.text

# <a class="link-tooltip" title="http://movie.imyjs.cn/" href="../site-32.html">
#                     <div class="card">
#                         <div class="card-heading">
#                             <span class="card-icon"><img src="http://movie.imyjs.cn/favicon.ico" onerror="javascript:this.src='../img/default_ico.png';"></span>
#                             <span class="card-title">理想智能云解析</span>
#                         </div>
#                         <div class="card-body">理想智能解析为您免费解析主流视频网站的VIP视频,支持爱奇艺、腾讯、优酷、乐视、芒果、搜狐、PPTV等等,可搜索最新的免费福利视频、电影和电视剧资源,欢迎使用!</div>
#                         <div class="card-footer">
#                         	<div class="view"><i class="fa fa-eye fa-fw" aria-hidden="true"></i> 240</div>
#                         	<div class="love love-btn" rel="32" name="love"><i class="fa fa-heart-o fa-fw" aria-hidden="true"></i> 12</div>
# 						</div>
#                 	</div>
# </a>
# <span class="title"><i class="fa fa fa-diamond fa-fw" aria-hidden="true"></i> 我的网站</span>
# <span class="more"><a href="../sort/mysite.html">更多</a></span>


# Homepage section matcher: starting at a closing </a>, lazily skip ahead to
# the next <span class="title">, capture the text between its </i> and </span>
# as `classname`, then capture the following <a href="..."> value as `url`.
# DOTALL lets `.` cross line breaks, since the markup spans several lines.
pattern = re.compile(
    '</a>.*?<span class="title">.*?</i>(?P<classname>.*?)</span>.*?<a href="(?P<url>.*?)"',
    flags=re.DOTALL,
)
# `html` is already a plain string, so the response can be released before
# the (lazy) match iterator is created.
resp.close()
rs = pattern.finditer(html)

# Bookmark-card matcher for a category page. Compiled once here rather than on
# every loop iteration. Captures, per card: the target link (title attribute),
# the favicon URL, the card title and the card description.
pattern2 = re.compile(
    '<a class="link-tooltip" title="(?P<link>.*?)" href.*?<span class="card-icon"><img src="(?P<icon>.*?)" onerror.*?<span class="card-title">(?P<title>.*?)</span>.*?'
    '<div class="card-body">(?P<desc>.*?)</div>', re.S)

classnum = 0   # homepage matches seen, including the skipped first one
booknum = 0    # bookmarks collected across all category pages
bookmark = []  # one [classname, title, link, icon, desc] row per bookmark
for item in rs:
    classnum += 1
    if classnum == 1:
        # The first match is the ranking list, not a real category: skip it.
        continue

    # strip() with no argument trims surrounding whitespace; the previous
    # strip("") removed nothing and left the padding in the category name.
    classname = item.group("classname").strip()
    # The hrefs are relative ("../sort/xxx.html"); drop the leading ".." and
    # append the rest to the site root. [-1] keeps hrefs without ".." usable
    # instead of raising IndexError.
    child_url = domain + item.group("url").split("..", 1)[-1]

    # The context manager closes the response even if reading it fails; the
    # timeout prevents an unresponsive page from blocking the crawl forever.
    with requests.get(child_url, timeout=10) as child_resp:
        child_html = child_resp.text

    for msg in pattern2.finditer(child_html):
        row = [
            classname,
            msg.group("title"),
            msg.group("link"),
            msg.group("icon"),
            msg.group("desc"),
        ]
        # Console output keeps the original "  ," separated layout.
        print("  ,".join(row))
        bookmark.append(row)
        booknum += 1

    # Pause between category pages to avoid hammering the site.
    time.sleep(2)

# classnum includes the skipped ranking match, hence the -1 (clamped so an
# empty homepage does not report -1). booknum already counts exactly the
# collected bookmarks, so it is reported as-is.
print(f"共计分类:{max(classnum - 1, 0)}个,书签:{booknum}个。\n 学习更多技术欢迎关注微信公众号:编程那点事儿")

# newline='' is what the csv module requires so that it controls line endings
# itself (otherwise blank rows appear on Windows). Each row is a list of
# fields; passing a plain string would make every character its own column.
with open('./bookmark.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(bookmark)

 

 

# 微信关注
#
# WeChat
#
# 阅读剩余
# THE END