# Python 爬取某导航网全部数据!
import requests
import re
import time
import csv
# Root URL of the navigation site; relative category links are appended to it.
domain = "http://dh.imyjs.cn"

# Fetch the home page. A timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from silently parsing an
# HTTP error page. The context manager releases the connection even on error.
with requests.get(domain, timeout=10) as resp:
    resp.raise_for_status()
    html = resp.text

# Sample of the markup the scraper targets (text content abbreviated):
#   <a class="link-tooltip" title="http://movie.imyjs.cn/" href="../site-32.html">
#     <div class="card">
#       <div class="card-heading">
#         <span class="card-icon"><img src="http://movie.imyjs.cn/favicon.ico" onerror="..."></span>
#         <span class="card-title">SITE TITLE</span>
#       </div>
#       <div class="card-body">SITE DESCRIPTION</div>
#       <div class="card-footer">...view / like counters...</div>
#     </div>
#   </a>
#   <span class="title"><i class="fa fa fa-diamond fa-fw" aria-hidden="true"></i> SECTION NAME</span>
#   <span class="more"><a href="../sort/mysite.html">MORE</a></span>

# Matches one section header on the home page: captures the section name that
# follows the icon and the href of the first link after it (the "more" link in
# the sample above). re.S lets ".*?" span line breaks in the HTML.
pattern = re.compile(
    r'</a>.*?<span class="title">.*?</i>(?P<classname>.*?)</span>.*?<a href="(?P<url>.*?)"',
    re.S,
)
# Lazy iterator over the section matches; `html` is already a plain string, so
# it stays valid after the response has been closed.
rs = pattern.finditer(html)

classnum = 0   # number of section matches seen so far
booknum = 0    # number of bookmarks collected so far
bookmark = []  # collected bookmark records
# Matches one bookmark card on a category page and captures its target link,
# icon URL, title and description. Compiled once here instead of on every
# loop iteration. re.S lets ".*?" span line breaks in the HTML.
pattern2 = re.compile(
    r'<a class="link-tooltip" title="(?P<link>.*?)" href.*?'
    r'<span class="card-icon"><img src="(?P<icon>.*?)" onerror.*?'
    r'<span class="card-title">(?P<title>.*?)</span>.*?'
    r'<div class="card-body">(?P<desc>.*?)</div>',
    re.S,
)

for item in rs:
    # classnum counts every section match, including the skipped first one.
    classnum += 1
    if classnum == 1:
        # The first match is the ranking section, not a category; skip it.
        continue

    # strip() with no argument removes the surrounding whitespace
    # (the original strip("") removed nothing).
    classname = item.group("classname").strip()

    # Hrefs look like "../sort/mysite.html": keep what follows the "..".
    # If there is no "..", use the href unchanged rather than raising IndexError.
    href = item.group("url")
    _, sep, tail = href.partition("..")
    child_url = domain + (tail if sep else href)

    # The context manager closes the response even if reading it fails; the
    # timeout prevents one stalled category from hanging the whole crawl.
    with requests.get(child_url, timeout=10) as child_resp:
        child_html = child_resp.text

    for msg in pattern2.finditer(child_html):
        # Keep each record as a list of fields so csv.writer can quote any
        # commas or line breaks inside a title or description.
        row = [
            classname,
            msg.group("title"),
            msg.group("link"),
            msg.group("icon"),
            msg.group("desc"),
        ]
        print(" ,".join(row))
        bookmark.append(row)
        booknum += 1

    # Pause between category requests to avoid hammering the site.
    time.sleep(2)

# Only classnum includes the skipped ranking section, so only it is reduced by
# one (clamped at zero when nothing matched); booknum is already exact.
print(f"共计分类:{max(classnum - 1, 0)}个,书签:{booknum}个。\n 学习更多技术欢迎关注微信公众号:编程那点事儿")

# newline='' is required by the csv module so that it controls line endings
# itself (otherwise blank rows appear on Windows).
with open('./bookmark.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    # Each row is a list of fields; passing a plain string to writerow would
    # write one character per column.
    writer.writerows(bookmark)
# 微信关注
# 阅读剩余
# 版权声明:
# 作者:理想
# 链接:https://www.imyjs.cn/archives/462
# 文章版权归作者所有,未经允许请勿转载。
# THE END