哔哩哔哩全站排行榜
哔哩哔哩排行榜:https://www.bilibili.com/v/popular/rank/all
代码实现
import csv

import parsel
import requests

# Scrape the Bilibili site-wide ranking page and append one CSV row per entry.

# Page to scrape.
url = 'https://www.bilibili.com/v/popular/rank/all'
# Send a desktop-browser User-Agent with the request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
# CSV columns: title, play count, danmaku count, author, overall score, video URL.
fieldnames = ['标题', '播放量', '弹幕量', '作者', '综合得分', '视频地址']
# CSS path of the <div class="detail"> that holds the per-video counters.
detail = 'div.content > div.info > div.detail'

# Fetch and parse before opening the CSV so a failed request does not leave a
# header-only file behind. The timeout keeps a stalled connection from hanging
# the script forever; raise_for_status() stops on an HTTP error response
# instead of silently parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
selector = parsel.Selector(response.text)
lis = selector.css('.rank-list li')  # every <li> inside class="rank-list"

# The with-block guarantees the file is flushed and closed even if a row fails.
with open('B站排行榜数据.csv', mode='a', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new or empty. Writing the header only then avoids a
    # duplicated header row on every re-run.
    if f.tell() == 0:
        csv_writer.writeheader()

    for li in lis:
        # get(default='') returns '' instead of None when nothing matches, so
        # the string methods below cannot raise AttributeError.
        # Title: the <a> inside class="info".
        title = li.css('.info a::text').get(default='')
        # Play count: first <span> inside the detail <div>.
        bf_info = li.css(detail + ' > span:nth-child(1)::text').get(default='').strip()
        # Danmaku count: second <span> inside the detail <div>.
        dm_info = li.css(detail + ' > span:nth-child(2)::text').get(default='').strip()
        # Author: the <span> inside the detail <div>'s <a>.
        bq_info = li.css(detail + ' > a > span::text').get(default='').strip()
        # Overall score: the <div> inside class="pts".
        score = li.css('.pts div::text').get(default='')
        # Video URL: href of the <a> inside class="img". Drop only a leading
        # '//'; removing every '//' would also mangle an absolute
        # 'https://...' href.
        page_url = li.css('.img a::attr(href)').get(default='')
        if page_url.startswith('//'):
            page_url = page_url[2:]

        dit = {
            '标题': title,
            '播放量': bf_info,
            '弹幕量': dm_info,
            '作者': bq_info,
            '综合得分': score,
            '视频地址': page_url,
        }
        csv_writer.writerow(dit)
        print(dit)
运行效果
标签解析
豆瓣电影排行榜
豆瓣电影排行榜:https://movie.douban.com/chart
代码实现
# coding: utf-8
import csv

import parsel
import requests

# Scrape the Douban movie chart page and append one CSV row per movie.

# Page to scrape.
url = 'https://movie.douban.com/chart'
# Send a desktop-browser User-Agent with the request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
# CSV columns: title, info, rating, number of ratings, link.
fieldnames = ['标题', '信息', '评分', '评价人数', '链接']

# Fetch and parse before opening the CSV so a failed request does not leave a
# header-only file behind. The timeout keeps a stalled connection from hanging
# the script forever; raise_for_status() stops on an HTTP error response
# instead of silently parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
selector = parsel.Selector(response.text)
lis = selector.css('.indent table')  # every <table> inside class="indent"

# The with-block guarantees the file is flushed and closed even if a row fails.
with open('豆瓣电影排行榜.csv', mode='a', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=fieldnames)
    # In append mode the stream starts at the end of the file, so position 0
    # means the file is new or empty. Writing the header only then avoids a
    # duplicated header row on every re-run.
    if f.tell() == 0:
        csv_writer.writeheader()

    for li in lis:
        # get(default='') returns '' instead of None when nothing matches, so
        # the string methods below cannot raise AttributeError.
        # Title: text of the <a> inside class="pl2"; '/' characters are
        # removed (replaced with the empty string) and surrounding
        # whitespace is stripped.
        movie_title = li.css('.pl2 a::text').get(default='').replace('/', '').strip()
        # Info: text of the <p> inside class="pl2".
        movie_info = li.css('.pl2 p::text').get(default='')
        # Rating: <span class="rating_nums"> inside class="star clearfix".
        movie_score = li.css('.star.clearfix span.rating_nums::text').get(default='')
        # Number of ratings: <span class="pl"> inside class="star clearfix",
        # with the parentheses removed.
        people_num = (
            li.css('.star.clearfix span.pl::text')
            .get(default='')
            .replace('(', '')
            .replace(')', '')
        )
        # Link: href of the <a> inside class="pl2".
        movie_url = li.css('.pl2 a::attr(href)').get(default='')

        dit = {
            '标题': movie_title,
            '信息': movie_info,
            # Append the unit only when a rating exists; concatenating onto a
            # missing rating raised TypeError in the original.
            '评分': movie_score + '分' if movie_score else '',
            '评价人数': people_num,
            '链接': movie_url,
        }
        csv_writer.writerow(dit)
        print(dit)
运行效果
标签解析
总结
选择器 | 解释 | 示例 |
---|---|---|
.rank-list li | class="rank-list"里的所有<li> | <div class="rank-list"><li></li> </div> |
div.detail > span:nth-child(1) | class="detail"的<div>里的第一个<span> | <div class="detail"><span></span> <span></span><span></span></div> |
.img a::attr(href) | class="img"里<a>的href属性值 | <div class="img"><a href="https://zzy.ink"></a></div> |
本文作者为吾名,转载请注明。