盘大叔叔logo

用Python把十几年前的百度博客文章全部搬到独立博客了

2021年06月01日     / 0评 / 1

中间断更十几年,忙于事业,最近抽出几天时间把博客的文章导到独立博客上。终于可以不用看平台脸色,也不用讨平台算法喜好,自己随心而记。

主要利用了 Selenium 抓取百度博客的文章,再通过 WordPress 的 XML-RPC 接口发布到独立博客,同时也把文章里的图片保存了下来。

账号、密码、文件路径等参数请自行配置,代码如下:

from datetime import datetime
from selenium import webdriver
import time
import pandas as pd
import urllib.request
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.users import GetUserInfo
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost


def goWp(title, content, excerpt, post_tag, category, strptime):
    """Publish one article to the WordPress site through its XML-RPC endpoint.

    Args:
        title: Post title.
        content: Post body.
        excerpt: Post summary.
        post_tag: Tag names stored under the "post_tag" taxonomy. The original
            comment describes a list such as ["tag1", "tag2"].
            NOTE(review): the caller in this file passes a plain string —
            confirm that the XML-RPC library accepts it.
        category: Category names stored under the "category" taxonomy (same
            shape as ``post_tag``).
        strptime: Publication time as text in "%Y/%m/%d %H:%M:%S" form, e.g.
            "2018/1/01 10:05:10". Lets an old article keep its original date.

    Raises:
        ValueError: If ``strptime`` does not match the expected format.
    """
    # Account placeholders and endpoint; fill these in before running.
    username = "wordpress账号"
    secret = "wordpress密码"
    endpoint = "https://pankuu.com/xmlrpc.php"
    # "publish" makes the post live immediately; use "draft" to save a draft.
    status = "publish"

    # Create the client first, then assemble the post.
    client = Client(endpoint, username, secret)

    article = WordPressPost()
    article.post_status = status
    article.title = title
    article.content = content
    article.excerpt = excerpt
    article.terms_names = {"post_tag": post_tag, "category": category}
    article.date = datetime.strptime(strptime, "%Y/%m/%d %H:%M:%S")

    # Send the post to the server.
    client.call(NewPost(article))
    print(title,"——发布成功!")


def download_img(img_url):
    """Download one image and save it into the local image folder.

    The saved file is named after the last "/"-separated segment of
    ``img_url``.

    Args:
        img_url: Absolute URL of the image. ``None`` or an empty string is
            tolerated and reported as a failure.

    Returns:
        The path of the saved file on success, or the string "failed" when
        the URL is empty or malformed, the request fails, the response status
        is not 200, or the file cannot be written.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    # An <img> element without a src yields None/""; there is nothing to fetch.
    if not img_url:
        return "failed"

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    try:
        # Request() raises ValueError on a malformed URL, so it must be built
        # inside the try for the function to honour its "failed" contract.
        request = urllib.request.Request(img_url, headers=header)
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(request) as response:
            if response.getcode() != 200:
                return "failed"
            data = response.read()

        img_name = img_url.split('/')[-1]
        filename = "D:\\盘大叔叔\\百度博客图片内容\\" + img_name
        with open(filename, "wb") as f:
            f.write(data)  # write the image bytes to disk
        return filename
    except (OSError, ValueError, http.client.HTTPException):
        # Best effort: network errors (URLError is an OSError), bad URLs,
        # broken HTTP responses and disk errors all count as a failed download.
        return "failed"

def blog_str(str):
    """Rewrite scraped article HTML for the new site.

    Replaces the old site name and domain, points Baidu image URLs at the
    local upload folder, and removes the wrapping article ``<div>``.

    Args:
        str: HTML of the article container. The parameter name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        The rewritten HTML. The trailing ``</div>`` is removed only when the
        text actually ends with it, so other input is never truncated.
    """
    # Order matters: the full old URL must be rewritten (and upgraded to
    # https) before the bare-domain rule runs.
    replacements = (
        ("PP知道网", "盘大叔叔"),
        ("http://www.ppzhidao.com", "https://pankuu.com"),
        ("ppzhidao.com", "pankuu.com"),
        ("http://img.baidu.com/hi/jx2/", "/wp-content/uploads/blog/"),
        ("http://hiphotos.baidu.com/zhangchaoxuan/pic/item/", "/wp-content/uploads/blog/"),
        ('<div class="pcs-article-content_ptkaiapt4bxy_baiduscarticle" id="detailArticleContent_ptkaiapt4bxy_baiduscarticle">', ''),
    )
    for old, new in replacements:
        str = str.replace(old, new)

    # Drop the wrapper's closing tag, but never blindly chop six characters
    # off text that does not end with it.
    closing_tag = "</div>"
    if str.endswith(closing_tag):
        str = str[:-len(closing_tag)]
    return str


# Load the article list exported to blog.csv.
try:
    # Keep the explicit text-mode open() so the CSV is decoded exactly as
    # before; the with-block makes sure the handle is closed afterwards.
    with open('./blog.csv', 'r') as file:
        blog_data = pd.read_csv(file)
except OSError as e:
    # Report the problem and stop with the real cause, instead of carrying on
    # and failing later with a NameError on the unopened file.
    print(e)
    raise

# Positional columns: column 1 is used as the article URL and column 3 as the
# article summary by the migration loop further down.
urls = blog_data.iloc[:, 1].tolist()
descs = blog_data.iloc[:, 3].tolist()

# Log in to Baidu, then scrape every article in `urls` and republish it.
browser = webdriver.Chrome('D:/chromedriver.exe')
try:
    browser.get('http://wenzhang.baidu.com/')

    #<editor-fold desc="Only needed when a username/password login is required">
    browser.find_element_by_id('TANGRAM__PSP_3__footerULoginBtn').click()  # switch to username login
    time.sleep(2)
    user_box = browser.find_element_by_id('TANGRAM__PSP_3__userName')
    user_box.clear()
    user_box.send_keys('这里输入百度账号')
    time.sleep(2)
    password_box = browser.find_element_by_id('TANGRAM__PSP_3__password')
    password_box.clear()
    password_box.send_keys('这里输入百度密码')
    time.sleep(2)
    browser.find_element_by_id('TANGRAM__PSP_3__submit').click()
    #</editor-fold>

    for url, desc in zip(urls, descs):
        time.sleep(5)
        browser.get(url)            # open the article page
        browser.switch_to.frame(0)  # the article lives inside the first iframe
        try:
            # Title
            title = browser.find_element_by_class_name('pcs-article-title_ptkaiapt4bxy_baiduscarticle').text
            print("==========\n",title,"\n")
            # Summary (comes from the CSV, not from the page)
            print("描述:",desc, "\n")
            # Date: the page shows only a day, so midnight is appended and
            # "-" becomes "/" to match the format goWp parses.
            d = browser.find_element_by_class_name('time-cang')
            date = d.text.replace("收藏于 ","")+" 00:00:00"
            date = date.replace("-","/")
            print(date, "\n")
            # Content: outer HTML of the article container, rewritten for the new site
            c = browser.find_element_by_class_name('pcs-article-content_ptkaiapt4bxy_baiduscarticle')
            content = blog_str(c.get_attribute("outerHTML"))
            print(content, "\n")
            # Images: save every image found in the frame
            for img in browser.find_elements_by_tag_name("img"):
                src = img.get_attribute("src")
                if not src:
                    # An <img> without a src has nothing to download.
                    continue
                download_img(src)
                print("\n 发现图片:",src)
        except Exception as e:
            # Scraping this article failed. Skip it rather than publishing
            # undefined or stale title/content/date from a previous article.
            print(e)
            continue
        # Publish the article
        goWp(title,content,desc,"博客","其他",date)
finally:
    # quit() ends the whole driver session, even when an error aborts the run.
    browser.quit()

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注

鲁ICP备2021023915号