I have been writing this blog for more than three years and have used three themes in total. This post records the problems I ran into and the related theme tweaks, so that I do not hit the same issues the next time I switch themes.

While migrating my blog to a new theme recently, I noticed that the view counts on the reading ranking page did not match the actual view counts. It turned out that the ranking is based on Leancloud, while the blog's view counter is based on Busuanzi (不蒜子). Since Busuanzi counts cannot be edited manually, the only way to make the two consistent is to migrate the Busuanzi counts to Leancloud. This post gives the migration approach and the code.

Dependencies

Python libraries:

pip install beautifulsoup4
pip install tqdm
pip install selenium
pip install fake_useragent

Other software:

The Chrome browser and chromedriver. Pick the chromedriver version that matches your browser, and add the directory of the downloaded chromedriver to the PATH environment variable.
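If you want to confirm the setup before running the full script, a minimal sketch like the one below (the url is only an example) starts a headless Chrome and fails immediately when chromedriver is missing from the PATH or does not match the browser version:

from selenium import webdriver

# starting a headless chrome fails if chromedriver is not on the PATH
# or does not match the installed chrome version
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get("https://doraemonzzz.com")
print(driver.title)
driver.quit()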

Code and basic workflow

Full code (see github for the latest version):

import requests
import os
import json
import argparse
import datetime
import time

from bs4 import BeautifulSoup
from urllib.parse import unquote
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

# number of urls per file
d = 100
# folder for the url lists
url_dir = "url_list"
# folder for the parsed data
data_dir = "data_list"

def get_max_page(base_url):
	"""
	Return the number of index pages of the blog.
	"""
	headers = {
		"Content-Type": "application/x-www-form-urlencoded",
		'User-Agent': UserAgent().random
	}
	response = requests.get(base_url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content
	soup = BeautifulSoup(response, "html.parser")
	max_page = 1
	# the largest page number shown in the pagination bar
	for page in soup.find_all(class_="page-number"):
		max_page = max(max_page, int(page.text))

	return max_page

def get_article_href(page, class_):
	"""
	Return the hyperlinks of all articles on one index page.
	"""
	headers = {
		"Content-Type": "application/x-www-form-urlencoded",
		'User-Agent': UserAgent().random
	}
	html_text = requests.get(page, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content
	soup = BeautifulSoup(html_text, "html.parser")
	hrefs = []
	# the class name depends on the theme,
	# e.g. "article-title" or "post-title-link"
	for text in soup.find_all(class_=class_):
		hrefs.append(text['href'])

	return hrefs

def get_page_pv(driver, url):
	"""
	Return the busuanzi page view count of one article.
	"""
	driver.get(url)
	# busuanzi fills the element via javascript, so poll until the text appears;
	# give up after about 10 seconds and let the caller fall back to a default
	for _ in range(100):
		p_element = driver.find_element(By.ID, 'busuanzi_value_page_pv')
		if p_element.text != "":
			break
		time.sleep(0.1)
	return p_element.text

def parse_article(driver, href, base_url):
	"""
	Parse one article and return json data of the form:
	{
		"createdAt": "2018-04-07T13:22:14.714Z",
		"time": 109,
		"title": "浙大数据结构Week3",
		"updatedAt": "2021-01-22T15:51:02.024Z",
		"url": "/2018/04/01/浙大数据结构Week3/"
	}
	"""
	headers = {
		"Content-Type": "application/x-www-form-urlencoded",
		'User-Agent': UserAgent().random
	}
	url = base_url + href
	response = requests.get(url, headers=headers, proxies={"http": "http://111.233.225.166:1234"}).content

	soup = BeautifulSoup(response, "html.parser")
	data = dict()
	# the meta/title class names below may need to be adjusted for other themes
	if soup.find(class_="post-meta-date-created") is not None:
		data["createdAt"] = soup.find(class_="post-meta-date-created")["datetime"]
	if soup.find(class_="post-meta-date-updated") is not None:
		data["updatedAt"] = soup.find(class_="post-meta-date-updated")["datetime"]
	if soup.find(class_="post-title") is not None:
		data["title"] = soup.find(class_="post-title").text
	data["url"] = unquote(href)
	cnt = get_page_pv(driver, url)
	# fall back to 1 if busuanzi never returned a value
	if cnt == "":
		data["time"] = 1
	else:
		data["time"] = int(cnt)

	return data

def main():
	start_time = time.time()
	# parse the command line arguments
	parser = argparse.ArgumentParser()
	parser.add_argument(
		"-b",
		"--base_url",
		help="url of the blog home page"
	)
	parser.add_argument(
		"-c",
		"--class_name",
		help="class name of the article hyperlinks"
	)
	parser.add_argument(
		"-s",
		"--start",
		type=int,
		default=0,
		help="index of the url batch to start parsing from, default 0"
	)
	args = parser.parse_args()
	# blog home page url
	base_url = args.base_url
	# example: -b "https://doraemonzzz.com"
	# class name of the hyperlinks
	class_ = args.class_name
	# example: -c "article-title"
	articles_batch = []
	# batch index to start from
	s = args.start
	url_path = os.path.join(os.getcwd(), url_dir)

	if os.path.exists(url_dir):
		# the url lists already exist, load them from disk
		for file in os.listdir(url_path):
			file_path = os.path.join(url_path, file)
			articles = []
			with open(file_path) as f:
				for data in f.readlines():
					articles.append(data.strip("\n"))
			articles_batch.append(articles)
	else:
		max_page = get_max_page(base_url)

		# build the url of every index page; the home page has a special form
		pages = [base_url] + [f"{base_url}/page/{i}" for i in range(2, max_page + 1)]

		# collect the hyperlink of every article
		articles = []
		for page in tqdm(pages):
			articles += get_article_href(page, class_)
			# time.sleep(3)

		# save the urls, at most d (100) urls per file
		if not os.path.exists(url_dir):
			os.mkdir(url_dir)

		n = len(articles)
		m = n // d
		for i in range(m):
			start = i * d
			end = (i + 1) * d
			url_output = os.path.join(url_path, f"{i}.txt")
			with open(url_output, "w") as f:
				for j in range(start, end):
					f.write(articles[j] + "\n")
			articles_batch.append(articles[start: end])

		# handle the remaining urls
		if (m * d != n):
			start = m * d
			end = n
			url_output = os.path.join(url_path, f"{m}.txt")
			with open(url_output, "w") as f:
				for j in range(start, end):
					f.write(articles[j] + "\n")
			articles_batch.append(articles[start: end])
	
	# parse each batch with a headless chrome browser
	options = webdriver.ChromeOptions()
	options.add_argument('--headless')
	driver = webdriver.Chrome(options=options)
	data_output = os.path.join(os.getcwd(), data_dir)

	# create the output folder
	if not os.path.exists(data_dir):
		os.mkdir(data_dir)

	for i, articles in enumerate(articles_batch):
		# skip the batches before the -s argument
		if i < s:
			continue
		output = os.path.join(data_output, f"{i}.json")
		with open(output, 'w', encoding='utf8') as json_file:
			json_file.write("[\n")
			for j, article in enumerate(tqdm(articles)):
				data = parse_article(driver, article, base_url)
				# write the separator before every record except the first,
				# so that the file is valid json without a trailing comma
				if j > 0:
					json_file.write(",\n")
				json.dump(data, json_file, ensure_ascii=False)
			json_file.write("\n]\n")
		# only one batch is parsed per run; rerun with -s to continue from the next batch
		break

	driver.quit()
	end_time = time.time()
	print(f"Total time: {datetime.timedelta(seconds=end_time-start_time)}.")

if __name__ == '__main__':
	main()

Basic workflow

The task itself is just a simple crawler, but the Busuanzi view count is computed by a javascript script (so a plain crawler cannot fetch it directly), which is why selenium and chromedriver are needed as well.
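To make the difference concrete, here is a minimal sketch (the url is one of the post urls used as an example above): the busuanzi_value_page_pv element fetched with requests has empty text, while the same element rendered in headless Chrome contains the count.

import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://doraemonzzz.com/2018/04/01/浙大数据结构Week3/"  # example post that embeds busuanzi

# static html: the span exists but its text is empty,
# because busuanzi only fills it in via javascript after the page loads
soup = BeautifulSoup(requests.get(url).content, "html.parser")
span = soup.find(id="busuanzi_value_page_pv")
print("requests:", span.text if span else "element not found")

# rendered html: after the script has run, the text holds the view count
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(2)  # give the busuanzi script a moment to finish
print("selenium:", driver.find_element(By.ID, 'busuanzi_value_page_pv').text)
driver.quit()

The overall workflow is as follows: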

  1. Input arguments

    1. Argument 1 is the url of the blog home page;

      1. For example https://doraemonzzz.com. Note that there must be no trailing /; do not enter my address as https://doraemonzzz.com/, for instance, otherwise the final parsed results will be wrong.
    2. Argument 2 is the class of the article title hyperlinks;

      1. Open the blog home page and the developer tools, choose Elements, press ctrl+shift+c, click a post title, and read off the corresponding class in Elements; for example, the class in the screenshot below is post-title-link (a small sketch after this list can also help find it):

    3. Run the following command in the script's directory:

      python busuanzi_stat.py -b url -c class_
  2. Parse the number of blog index pages;

  3. Collect the hyperlink of every article on each index page;

    1. The links are saved into the url_list folder, which contains several files (0.txt, 1.txt, …), each holding at most 100 article urls. The later parsing step processes one file at a time; this batching is needed because parsing too many articles in a single run tends to fail. If parsing i.txt fails, just rerun with the following extra argument

      -s i
  4. Fetch the view count and the other fields of every article and save them to json files;

    1. The results are saved into the data_list folder.

    2. Sample record (it is recommended that every record contains all of these fields):

      {
      	"createdAt": "2018-04-07T13:22:14.714Z",
      	"time": 109,
      	"title": "浙大数据结构Week3",
      	"updatedAt": "2021-01-22T15:51:02.024Z",
      	"url": "/2018/04/01/浙大数据结构Week3/"
      }
  5. In Leancloud, find the application used for the view statistics (usually the one used by valine), import the json files in data_list, and fill in Counter as the Class name (two helper sketches follow below):
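For step 1.2, besides the developer tools, a small sketch like the following can also help spot the right class name: it prints the class attribute of every link on the home page, and the class shared by the post titles (e.g. post-title-link or article-title) is the one to pass with -c.

import requests
from bs4 import BeautifulSoup

# list the class attribute and target of every link on the home page;
# the class shared by all post titles is the value to pass with -c
soup = BeautifulSoup(requests.get("https://doraemonzzz.com").content, "html.parser")
for a in soup.find_all("a", class_=True):
	print(a.get("class"), a.get("href"))

For step 5, the Leancloud console may be easier to use with a single file; the sketch below (counter.json is just a hypothetical output name) merges all batch files in data_list into one json array before importing.

import json
import os

data_dir = "data_list"
merged = []
# collect the records of every batch file (0.json, 1.json, ...)
for name in sorted(os.listdir(data_dir), key=lambda x: int(x.split(".")[0])):
	with open(os.path.join(data_dir, name), encoding="utf8") as f:
		merged += json.load(f)

# write one array that can be imported into the Counter class
with open("counter.json", "w", encoding="utf8") as f:
	json.dump(merged, f, ensure_ascii=False, indent=2)
print(f"merged {len(merged)} records")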

Testing

My own blog has been fully migrated; for testing, I picked a random blog found online that has Busuanzi statistics enabled, and the corresponding test command is as follows:

python busuanzi_stat.py -b "https://josh-gao.top" -c "post-title-link"
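As a quick sanity check before importing, a short sketch like this (assuming the first batch file data_list/0.json) prints the ten articles with the highest counts, which should roughly match the numbers Busuanzi shows on the blog itself:

import json

# load one batch produced by busuanzi_stat.py and list the top 10 articles by view count
with open("data_list/0.json", encoding="utf8") as f:
	records = json.load(f)

for record in sorted(records, key=lambda r: r["time"], reverse=True)[:10]:
	print(record["time"], record.get("title", record["url"]))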