Migrating Busuanzi View Counts of a Hexo Blog to LeanCloud
I have been writing this blog for more than three years and have used three themes along the way. This post records the problems I ran into and the related theme tweaks, so that I do not hit the same issues the next time I switch themes.
While recently migrating the blog to a new theme, I noticed that the view counts on the reading ranking did not match the actual view counts. After some digging, the cause turned out to be that the ranking is backed by LeanCloud, while the per-post view counts come from Busuanzi. Since Busuanzi counts cannot be edited manually, the only way to make the two consistent is to migrate the Busuanzi counts into LeanCloud. This post presents the migration approach and the code.
References:
- https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python
- https://stackoverflow.com/questions/50416538/python-phantomjs-says-i-am-not-using-headless/53791690
- https://www.cnblogs.com/lizm166/p/8284131.html
Dependencies
Python libraries:
pip install beautifulsoup4
pip install tqdm
pip install selenium
pip install fake_useragent
Other software:
Chrome browser and chromedriver. Pick the chromedriver release that matches your Chrome version, and add the directory of the downloaded chromedriver to the PATH environment variable.
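To confirm that chromedriver is on the PATH and that Selenium can actually drive a headless Chrome, a minimal smoke test along these lines should print "Example Domain" (just a sketch, using the same Selenium 3 style API as the script below):

```python
# Minimal check that Selenium + chromedriver are wired up correctly.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('headless')            # run Chrome without opening a window
driver = webdriver.Chrome(options=options)  # fails here if chromedriver is not on PATH
driver.get("https://example.com")
print(driver.title)                         # expected output: Example Domain
driver.quit()
```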
Code and Basic Workflow
Full code (see GitHub for the latest version):
```python
import requests
import os
import json
import argparse
import datetime
import time
from bs4 import BeautifulSoup
from urllib.parse import unquote
from tqdm import tqdm
from selenium import webdriver
from fake_useragent import UserAgent
# number of urls per file
d = 100
# folder for saving the url lists
url_dir = "url_list"
# folder for saving the parsed data
data_dir = "data_list"
def get_max_page(base_url):
    """
    Return the number of pages of blog posts.
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        'User-Agent': UserAgent().random  # "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36"
    }
    # response = requests.get(base_url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).text
    response = requests.get(base_url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content
    soup = BeautifulSoup(response, "html.parser")
    max_page = 1
    # the pagination widget of the theme exposes the page numbers
    for page in soup.find_all(class_="page-number"):
        max_page = max(max_page, int(page.text))
    return max_page
def get_article_href(page, class_):
    """
    Return the hyperlinks of all articles on one page.
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        'User-Agent': UserAgent().random
    }
    # html_text = requests.get(page, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).text
    html_text = requests.get(page, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content
    soup = BeautifulSoup(html_text, "html.parser")
    hrefs = []
    # adjust the class name according to your theme
    # for text in soup.find_all(class_="article-title"):
    # for text in soup.find_all(class_="post-title-link"):
    for text in soup.find_all(class_=class_):
        # print(text)
        hrefs.append(text['href'])
    return hrefs
def get_page_pv(driver, url):
    """
    Return the Busuanzi page view count of a single article.
    """
    driver.get(url)
    # The count is filled in by JavaScript, so poll until the element's text
    # is non-empty; give up after a few tries and let the caller fall back
    # to a default value.
    p_element = driver.find_element_by_id(id_='busuanzi_value_page_pv')
    for _ in range(10):
        if p_element.text != "":
            break
        time.sleep(0.5)
        p_element = driver.find_element_by_id(id_='busuanzi_value_page_pv')
    return p_element.text
def parse_article(driver, href, base_url):
    """
    Parse one article and build JSON data of the following form:
    {
        "createdAt": "2018-04-07T13:22:14.714Z",
        "time": 109,
        "title": "浙大数据结构Week3",
        "updatedAt": "2021-01-22T15:51:02.024Z",
        "url": "/2018/04/01/浙大数据结构Week3/"
    }
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        'User-Agent': UserAgent().random
    }
    url = base_url + href
    # response = requests.get(url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).text
    response = requests.get(url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content
    soup = BeautifulSoup(response, "html.parser")
    data = dict()
    if soup.find(class_="post-meta-date-created") is not None:
        data["createdAt"] = soup.find(class_="post-meta-date-created")["datetime"]
    if soup.find(class_="post-meta-date-updated") is not None:
        data["updatedAt"] = soup.find(class_="post-meta-date-updated")["datetime"]
    if soup.find(class_="post-title") is not None:
        data["title"] = soup.find(class_="post-title").text
    data["url"] = unquote(href)
    cnt = get_page_pv(driver, url)
    if cnt == "":
        data["time"] = 1
    else:
        data["time"] = int(cnt)
    return data
def main():
    start_time = time.time()
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-b",
        "--base_url",
        help="homepage url of the blog"
    )
    parser.add_argument(
        "-c",
        "--class_name",
        help="class name of the article hyperlinks"
    )
    parser.add_argument(
        "-s",
        "--start",
        type=int,
        default=0,
        help="index of the url batch to start parsing from, 0 by default"
    )
    args = parser.parse_args()
    # homepage of the blog
    base_url = args.base_url
    # example: -b "https://doraemonzzz.com"
    # class name of the article hyperlinks
    class_ = args.class_name
    # example: -c "article-title"
    articles_batch = []
    # batch to start from
    s = args.start
    url_path = os.path.join(os.getcwd(), url_dir)
    if os.path.exists(url_path):
        # url lists were saved by a previous run, reuse them in numeric order
        for file in sorted(os.listdir(url_path), key=lambda name: int(name.split(".")[0])):
            file_path = os.path.join(url_path, file)
            articles = []
            with open(file_path) as f:
                for data in f.readlines():
                    articles.append(data.strip("\n"))
            articles_batch.append(articles)
    else:
        max_page = get_max_page(base_url)
        # hyperlink of every page; the homepage has a special form
        pages = [base_url] + [f"{base_url}/page/{i}" for i in range(2, max_page + 1)]
        # hyperlinks of every article
        articles = []
        for page in tqdm(pages):
            articles += get_article_href(page, class_)
            # time.sleep(3)
        # save the urls, at most d (=100) urls per file
        if not os.path.exists(url_path):
            os.mkdir(url_path)
        n = len(articles)
        m = n // d
        for i in range(m):
            start = i * d
            end = (i + 1) * d
            url_output = os.path.join(url_path, f"{i}.txt")
            with open(url_output, "w") as f:
                for j in range(start, end):
                    f.write(articles[j] + "\n")
            articles_batch.append(articles[start: end])
        # handle the remaining urls
        if m * d != n:
            start = m * d
            end = n
            url_output = os.path.join(url_path, f"{m}.txt")
            with open(url_output, "w") as f:
                for j in range(start, end):
                    f.write(articles[j] + "\n")
            articles_batch.append(articles[start: end])
    # parse the articles
    # start a headless chrome browser
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    data_output = os.path.join(os.getcwd(), data_dir)
    # create the output folder
    if not os.path.exists(data_output):
        os.mkdir(data_output)
    for i, articles in enumerate(articles_batch):
        if i < s:
            continue
        output = os.path.join(data_output, f"{i}.json")
        with open(output, 'w', encoding='utf8') as json_file:
            json_file.write("[\n")
            for j, article in enumerate(tqdm(articles)):
                data = parse_article(driver, article, base_url)
                if j > 0:
                    json_file.write(",\n")
                json.dump(data, json_file, ensure_ascii=False)
            json_file.write("\n]\n")
        # only one batch is parsed per run; rerun with -s to continue with the next batch
        break
    driver.quit()
    end_time = time.time()
    print(f"Total time: {datetime.timedelta(seconds=end_time - start_time)}.")


if __name__ == '__main__':
    main()
```
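One compatibility note: the script uses the Selenium 3 style `find_element_by_id`. That helper was deprecated in Selenium 4 and removed in later 4.x releases, so with a current Selenium install the lookup in get_page_pv would need the By-based form instead, roughly:

```python
# Equivalent element lookup under Selenium 4 (the find_element_by_* helpers were removed).
from selenium.webdriver.common.by import By

p_element = driver.find_element(By.ID, "busuanzi_value_page_pv")
```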
Basic Workflow
The task itself is just a simple crawler, but because the Busuanzi view count is computed by a JavaScript snippet (so a plain crawler cannot fetch it directly), Selenium and chromedriver are also needed. The overall workflow is as follows:
1. Input the parameters:
   - The first parameter is the url of the blog homepage.
   - The second parameter is the class of the article title hyperlinks. To find it, open the blog homepage and the developer tools, select Elements, press Ctrl+Shift+C, pick a post title on the page, and read off the corresponding class in the Elements panel; for example, in the figure below the class is post-title-link.
   - Then run the following command in the directory of the script:
     python busuanzi_stat.py -b url -c class_
2. Parse the number of pages of the blog.
3. Collect the article urls on every page and save them into the url_list folder. The folder contains several files (0.txt, 1.txt, ...), each holding at most 100 article urls, and the later parsing step works through these files batch by batch. The batching exists because parsing too many articles in one run tends to fail; if parsing fails on i.txt, rerun the script with the extra argument -s i (for example, -s 2 to restart from 2.txt).
4. Fetch each article's view count together with the remaining fields and save them as JSON files in the data_list folder.
A sample record (it is recommended that every record contains all of these fields):
{ "createdAt": "2018-04-07T13:22:14.714Z", "time": 109, "title": "浙大数据结构Week3", "updatedAt": "2021-01-22T15:51:02.024Z", "url": "/2018/04/01/浙大数据结构Week3/" }
Finally, in LeanCloud, open the application that backs the view statistics (usually the application used by Valine), import the JSON files from data_list, and enter Counter as the Class name.
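As an alternative to the console's import dialog, the same records can in principle be written through the LeanCloud Python SDK (pip install leancloud). This is only a sketch: it assumes your own App ID / App Key and a counter class named Counter, and note that createdAt and updatedAt are reserved fields managed by LeanCloud, so unlike the console import they cannot simply be set from the SDK; only url, title, and time are copied here.

```python
# Hypothetical alternative to the console import: push one batch of records
# into the Counter class with the LeanCloud Python SDK.
import json
import leancloud

leancloud.init("YOUR_APP_ID", "YOUR_APP_KEY")   # replace with your own credentials
Counter = leancloud.Object.extend("Counter")

with open("data_list/0.json", encoding="utf8") as f:
    records = json.load(f)

objects = []
for record in records:
    counter = Counter()
    counter.set("url", record["url"])
    counter.set("title", record.get("title", ""))
    counter.set("time", record["time"])
    objects.append(counter)

# save_all sends the objects to LeanCloud in batch requests
leancloud.Object.save_all(objects)
```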
Testing
My own blog has already been migrated, so for a test I picked a random blog found online that has Busuanzi statistics enabled; the corresponding test command is:
python busuanzi_stat.py -b "https://josh-gao.top" -c "post-title-link"