中间断更十几年,忙于事业,最近抽出几天时间把博客的文章导到独立博客上。终于可以不用看平台脸色,也不用讨平台算法喜好,自己随心而记。
主要利用了 Selenium 抓取文章内容,再通过 WordPress 的 XML-RPC 接口(wordpress_xmlrpc)发布,文章里的图片也一并保存到了本地。
账号、密码、发布地址、图片保存路径等参数请自行配置,代码如下:
from datetime import datetime
from selenium import webdriver
import time
import pandas as pd
import urllib.request
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
def goWp(title, content, excerpt, post_tag, category, strptime, status="publish"):
    """Publish one article to the WordPress site through its XML-RPC endpoint.

    Args:
        title: Post title.
        content: Post body.
        excerpt: Post summary.
        post_tag: A tag name, or a list of tag names such as ["tag1", "tag2"].
        category: A category name, or a list of category names.
        strptime: Publication date as text in "%Y/%m/%d %H:%M:%S" form, e.g.
            "2018/1/01 10:05:10". This lets an article written in the past
            keep its original publication date.
        status: Post status to set; "publish" (the default) publishes the
            post directly, "draft" saves it as a draft.

    Raises:
        ValueError: If strptime does not match the expected date format.
        Errors raised by the XML-RPC client are not handled here and
        propagate to the caller.
    """
    username = "wordpress账号"
    password = "wordpress密码"
    # Publishing endpoint.
    url = "https://pankuu.com/xmlrpc.php"

    # Terms are expected as lists of names per taxonomy; wrap a bare string
    # so that callers passing a single name still get it applied.
    tags = [post_tag] if isinstance(post_tag, str) else post_tag
    categories = [category] if isinstance(category, str) else category

    # Create the client.
    wp = Client(url, username, password)

    # Build the new post.
    post = WordPressPost()
    post.post_status = status
    post.title = title
    post.content = content
    post.excerpt = excerpt
    post.terms_names = {
        "post_tag": tags,
        "category": categories,
    }
    post.date = datetime.strptime(strptime, "%Y/%m/%d %H:%M:%S")

    # Send it.
    wp.call(NewPost(post))
    print(title,"——发布成功!")
'''
下载图片
'''
def download_img(img_url):
    """Download one image and save it in the local blog-image folder.

    The saved file is named after the last "/"-separated segment of img_url.

    Args:
        img_url: URL of the image to fetch.

    Returns:
        The path of the saved file on success, or the string "failed" when
        the URL is malformed, the request fails, the response status is not
        200, or the file cannot be written.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    try:
        # Building the request is inside the try so that a malformed URL is
        # reported as "failed" like every other download problem.
        request = urllib.request.Request(img_url, headers=header)
        # The context manager closes the HTTP response even if saving fails.
        with urllib.request.urlopen(request) as response:
            if response.getcode() != 200:
                return "failed"
            img_name = img_url.split('/')[-1]
            filename = "D:\\盘大叔叔\\百度博客图片内容\\" + img_name
            with open(filename, "wb") as f:
                f.write(response.read())  # write the image bytes to disk
            return filename
    except Exception:
        # Best-effort download: any ordinary error means "failed". Unlike a
        # bare except, this still lets KeyboardInterrupt/SystemExit through.
        return "failed"
'''
替换文章内容数据
'''
def blog_str(str):
    """Rewrite an article's HTML for the new site.

    Replaces the old site name and domain with the new ones, points the old
    Baidu image URL prefixes at /wp-content/uploads/blog/, and removes the
    article's wrapper <div> (its opening tag and a trailing "</div>").

    Args:
        str: HTML text of the article. The parameter name is kept for
            compatibility with existing callers.

    Returns:
        The rewritten HTML text.
    """
    # Work on a separate local so the built-in str is not rebound further.
    text = str

    # Applied in order: the full old URL must be replaced before the bare
    # domain, otherwise the "http://www." prefix would be left behind.
    replacements = (
        ("PP知道网", "盘大叔叔"),
        ("http://www.ppzhidao.com", "https://pankuu.com"),
        ("ppzhidao.com", "pankuu.com"),
        ("http://img.baidu.com/hi/jx2/", "/wp-content/uploads/blog/"),
        ("http://hiphotos.baidu.com/zhangchaoxuan/pic/item/", "/wp-content/uploads/blog/"),
        ('<div class="pcs-article-content_ptkaiapt4bxy_baiduscarticle" id="detailArticleContent_ptkaiapt4bxy_baiduscarticle">', ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Drop the wrapper's closing tag only when it is really there, instead of
    # blindly cutting the last six characters off whatever was passed in.
    closing_tag = "</div>"
    if text.endswith(closing_tag):
        text = text[:-len(closing_tag)]
    return text
# Read the article list from blog.csv.
try:
    # The with-block closes the file handle once pandas has parsed it.
    with open('./blog.csv', 'r') as file:
        blog_data = pd.read_csv(file)
except OSError as e:
    # Nothing can be migrated without the CSV: report the error and stop,
    # rather than carrying on into a NameError further down.
    print(e)
    raise

# Column 1 is used as the article URL and column 3 as the post summary.
urls = blog_data.iloc[:, 1].tolist()
descs = blog_data.iloc[:, 3].tolist()

# Log in.
# NOTE(review): the find_element_by_* helpers and the positional driver path
# are Selenium 3 style; they would need updating for Selenium 4 - confirm the
# installed version before changing them.
browser = webdriver.Chrome('D:/chromedriver.exe')
try:
    browser.get('http://wenzhang.baidu.com/')
    #<editor-fold desc="Only needed when a username/password login is required">
    browser.find_element_by_id('TANGRAM__PSP_3__footerULoginBtn').click()  # choose username login
    time.sleep(2)
    browser.find_element_by_id('TANGRAM__PSP_3__userName').clear()  # username box
    browser.find_element_by_id('TANGRAM__PSP_3__userName').send_keys('这里输入百度账号')
    time.sleep(2)
    browser.find_element_by_id('TANGRAM__PSP_3__password').clear()
    browser.find_element_by_id('TANGRAM__PSP_3__password').send_keys('这里输入百度密码')
    time.sleep(2)
    browser.find_element_by_id('TANGRAM__PSP_3__submit').click()
    #</editor-fold>

    for url, desc in zip(urls, descs):
        time.sleep(5)
        browser.get(url)  # open the article page
        try:
            browser.switch_to.frame(0)  # the article lives in the first iframe
            # Title
            title = browser.find_element_by_class_name('pcs-article-title_ptkaiapt4bxy_baiduscarticle').text
            print("==========\n",title,"\n")
            # Summary
            print("描述:",desc, "\n")
            # Date: the page shows only a day, so midnight is appended and
            # "-" is turned into "/" to match the format goWp parses.
            d = browser.find_element_by_class_name('time-cang')
            date = d.text.replace("收藏于 ","")+" 00:00:00"
            date = date.replace("-","/")
            print(date, "\n")
            # Content: the element's outer HTML, rewritten for the new site
            c = browser.find_element_by_class_name('pcs-article-content_ptkaiapt4bxy_baiduscarticle')
            content = blog_str(c.get_attribute("outerHTML"))
            print(content, "\n")
            # Images
            imgs = browser.find_elements_by_tag_name("img")
        except Exception as e:
            # Scraping failed: skip this article instead of publishing stale
            # or undefined title/content/date values.
            print(e)
            continue

        # Image downloads are best-effort: one bad image must not stop the
        # remaining images or the publication of the article itself.
        for img in imgs:
            try:
                src = img.get_attribute("src")
                download_img(src)
            except Exception as e:
                print(e)
                continue
            print("\n 发现图片:", src)

        # Publish the article.
        goWp(title,content,desc,"博客","其他",date)
finally:
    # quit() ends the WebDriver session, also when an error interrupts the run.
    browser.quit()