程序猿 程序猿

用python脚本删除QQ空间说说

in 技术教程 阅读[820] ShiKun 文章转载请注明来源!

上一篇文章《用python爬取QQ空间说说》说到了用python把QQ空间的说说爬取下来。

今天,我要用python把QQ空间所有的说说都删除掉。动机就不说了。

整体的步骤跟爬取说说的步骤差不多。

算了,不想写,直接上传代码吧。遇到了一个问题,删除一百多条说说后,腾讯就要验证码了,验证码识别太麻烦了,就不弄了。

源文件:clear_qzone.py

#coding=utf-8
#导入selenium2中的webdriver库
from selenium.webdriver.support.select import Select
from selenium import webdriver
from selenium.webdriver.common.keys import Keys     
from selenium.webdriver.common.action_chains import ActionChains   
from bs4 import BeautifulSoup
from PIL import Image
from PIL import ImageOps
import time
import re
import lxml
import sys
import subprocess
import urllib

# Module-wide log file, opened in append mode. The functions below write
# timestamped messages to it; it is closed in quit() and in the
# "another instance is running" branch of the main block.
fp = open('log.txt', 'a')

def get_time():
    """Return the current local time as a log prefix.

    The result looks like '[2017-10-08 22:41:00] ' (note the trailing space),
    so it can be concatenated directly in front of a log message.
    """
    now = time.localtime(time.time())
    stamp = time.strftime('[%Y-%m-%d %H:%M:%S] ', now)
    return stamp

# QQ number of the Qzone to work on (original note: "the Qzone to crawl");
# it is substituted into the page URL opened by login().
qq = '123456789'
# QQ number of the account used to log in
myqq = '123456789'
# Password of the login account
passwd = '123456'



def quit(driver):
    """Release everything the script holds and terminate the process.

    Writes 'False' to flag.txt (the marker the main block checks to detect a
    running instance), closes the shared log file, shuts the browser down and
    finally calls sys.exit(), so this function never returns normally.

    Args:
        driver: the Selenium WebDriver to shut down.

    Raises:
        SystemExit: always, on the normal path.
    """
    try:
        with open('flag.txt', 'w') as flag_file:
            flag_file.write('False')
        fp.close()
    finally:
        # Close the browser even when resetting the flag or closing the log
        # failed; otherwise the Chrome process would be left running.
        driver.quit()
    sys.exit()


def login():
    """Start Chrome, open the target Qzone page and sign in.

    Uses the module-level `qq` (page owner), `myqq` and `passwd` (login
    account). After signing in, focus is switched to 'app_canvas_frame'.

    Returns:
        The Selenium WebDriver, positioned inside 'app_canvas_frame'.

    If any step of the sign-in sequence fails, a message is written to the
    log and quit(driver) is called, which terminates the process.
    """
    # Launch a Chrome browser with the "test-type" command-line switch
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("test-type")
    driver = webdriver.Chrome(chrome_options=chrome_opts)

    # Fix the window position and size
    driver.set_window_position(20, 40)
    driver.set_window_size(1100, 700)

    # Open the Qzone page; the login form is shown inside a frame
    login_url = 'http://user.qzone.qq.com/%s/311' % qq
    driver.get(login_url)
    time.sleep(1)

    # (input element id, text to type) for the account and password fields
    credentials = (('u', myqq), ('p', passwd))
    try:
        driver.switch_to_frame('login_frame')
        # Switch to the account/password form, fill it in and submit
        driver.find_element_by_id('switcher_plogin').click()
        for field_id, text in credentials:
            driver.find_element_by_id(field_id).clear()
            driver.find_element_by_id(field_id).send_keys(text)
        driver.find_element_by_id('login_button').click()
        time.sleep(3)
        driver.switch_to_frame('app_canvas_frame')
        time.sleep(1)
    except:
        # Any failure here goes through quit(), which resets flag.txt
        fp.write(get_time() + '出师不利,登录失败\n')
        quit(driver)
    return driver

def cleanImage(imagePath):
    """Binarise the image at imagePath and pad it, overwriting the file.

    Every pixel value below 143 becomes 0 and every other value becomes 255,
    then a 20-pixel white border is added around the result and the image is
    saved back to the same path.

    Args:
        imagePath: path of the image file to process in place.
    """
    def to_black_or_white(level):
        # Threshold a single pixel value to pure black or pure white
        if level < 143:
            return 0
        return 255

    binarised = Image.open(imagePath).point(to_black_or_white)
    padded = ImageOps.expand(binarised, border=20, fill='white')
    padded.save(imagePath)


def getAuthCode(driver, url):
    captchaUrl = url
    #driver.get(captchaUrl)  
    time.sleep(0.5)
    #driver.save_screenshot("captcha.jpg")   #截屏,并保存图片
    img_name = 'captcha.png'
    urllib.urlretrieve(captchaUrl, img_name)
    time.sleep(0.5)
    cleanImage(img_name)
    p = subprocess.Popen(["tesseract", img_name, "captcha", "-psm 7"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    p.wait()
    try:
        with open("captcha.txt", "r") as f:
            captchaResponse = f.read().replace(" ", "").replace("\n", "")

            print "验证码: " ,captchaResponse, '长度:', len(captchaResponse),

            if len(captchaResponse) == 4:
                return captchaResponse
            else:
                return False
    except IOError:
        return False




def get_page(driver):
    """Return the HTML source of the page currently loaded in `driver`."""
    html = driver.page_source
    return html

def delete_shuoshuo(bd, driver):
    """Delete one post ("shuoshuo") through the browser UI.

    Args:
        bd: parsed node of the post's footer (the caller passes the
            div with class 'ft'); when None the function does nothing.
        driver: Selenium WebDriver; the caller leaves it focused on
            'app_canvas_frame'.

    Flow: print the post title, force the first "more" dropdown on the page
    open, click the delete link, confirm in the dialog on the top-level
    document, then check whether a captcha frame appeared. Without a captcha
    the deletion is logged as successful and focus returns to
    'app_canvas_frame'. With a captcha, the script logs it and calls
    quit(driver), which exits the process.
    """
    # Read the post text (the title attribute of the detail link)
    if bd is None:
        return 

    # NOTE(review): item is concatenated with byte-string literals when
    # logging below; under Python 2 this may need explicit encoding if item
    # is unicode -- confirm.
    item = bd.find('a', class_='c_tx c_tx3 goDetail').get('title')
    print item,

    # Locate the first closed "more" dropdown in the live page and open it by
    # rewriting its class attribute, so the delete link becomes clickable.
    more = driver.find_element_by_css_selector("[class='dropdown more-edit-items']")
    driver.execute_script("""
        var element = arguments[0];
        element.setAttribute('class', 'dropdown more-edit-items dropdown-open');
        """, more)
    time.sleep(1)
    #delete = driver.find_element_by_css_selector("[class='del del_btn author_display']")
    try:
        delete = driver.find_element_by_link_text('删除')
        #print 'delete:',delete
        # NOTE(review): Selenium's find_element_* calls are documented to
        # raise rather than return None, so this check probably never fires.
        if delete is None:
            print 'delete is None'
            return

        delete.click()
    except:
        # Could not find or click the delete link: log and terminate
        fp.write(get_time() + '无法获取删除按钮\n')
        quit(driver)    

    time.sleep(1)
    # The confirmation dialog is looked up in the top-level document
    driver.switch_to_default_content()
    try:
        yes = driver.find_element_by_link_text('是')

        # NOTE(review): same remark as for `delete` above
        if yes is None:
            print 'yes is None'
            return
        yes.click() 
    except:
        # Could not find or click the confirmation button: log and terminate
        fp.write(get_time() + '无法获取删除对话框\n')
        quit(driver)

    # Check whether a captcha has to be entered
    #try:
    time.sleep(1)
    try:
        driver.switch_to_frame('verify_dialog_frame')
    except:
        # No captcha frame: treat the deletion as done and go back to the
        # frame that holds the post list.
        print '不需要验证码 删除成功'
        fp.write(get_time() + item + '删除成功\n')
        driver.switch_to_frame('app_canvas_frame')
        return

    print '需要验证码',
    fp.write(get_time() + '需要验证码,暂时退出\n')
    # quit() ends with sys.exit(), so everything below this call is currently
    # unreachable; it is the unfinished captcha-solving path.
    quit(driver)

    img_page = get_page(driver)
    soup = BeautifulSoup(img_page,'lxml')
    time.sleep(1)

    # Address of the captcha image inside the verification dialog
    img = soup.find('div', id='web_verify').find('img', id='verifyImg').get('src')
    if img is None:
        print '无法获取验证码地址',
        driver.find_element_by_css_selector("[class='spr bt_tip_normal']").click()
        driver.switch_to_frame('app_canvas_frame')
        return

    #print img
    code = getAuthCode(driver, img)

    if code:
        # Type the recognised code and click the 'bt_tip_over' button
        driver.find_element_by_id('verifyInput').clear()
        driver.find_element_by_id('verifyInput').send_keys(code)
        driver.find_element_by_css_selector("[class='spr bt_tip_over']").click()
        print '删除成功'
    else:
        # Recognition failed: click the 'bt_tip_normal' button instead
        print '无法识别验证码'
        driver.find_element_by_css_selector("[class='spr bt_tip_normal']").click()

    # Return focus to the frame that holds the post list
    time.sleep(0.5)
    driver.switch_to_default_content()
    time.sleep(0.5)
    driver.switch_to_frame('app_canvas_frame')
    
    return

if __name__ == '__main__':
    # flag.txt is a simple run marker: 'True' means another instance is
    # running. A missing file (e.g. on the very first run) is treated as
    # "not running" instead of crashing with IOError.
    try:
        with open('flag.txt', 'r') as f:
            already_running = f.readline().strip() == 'True'
    except IOError:
        already_running = False

    if already_running:
        fp.write(get_time() + '另一个程序正在运行\n')
        fp.close()
        sys.exit()

    with open('flag.txt', 'w') as f:
        f.write('True')

    driver = login()
    try:
        # Scroll the pager into view, jump to the last page and parse it
        target = driver.find_element_by_id("pager_next_0")
        driver.execute_script("arguments[0].scrollIntoView();", target)
        driver.find_element_by_id('pager_last_0').click()
        time.sleep(2)
        pages = get_page(driver)
        soup = BeautifulSoup(pages, 'lxml')

        # total_page is not used afterwards; the lookup is kept so that a
        # page without the pager element is reported as a load failure here.
        total_page = soup.find('span', class_='current').get_text()
        ol = soup.find('ol', id='msgList')
    except:
        # Kept broad so that every failure goes through quit(), which resets
        # flag.txt and closes the browser.
        fp.write(get_time() + '页面加载失败,退出\n')
        quit(driver)

    # Work backwards from the last page towards page 1
    page = 1
    while True:
        msgList = ol.find_all('li', class_='feed')
        print('page: ' + str(page) + ' msgList: ' + str(len(msgList)))

        for post in msgList:
            # Delete the post this entry belongs to
            delete_shuoshuo(post.find('div', class_='ft'), driver)
            time.sleep(1)

        # Read the current page number from the parsed pager
        try:
            cur_page = int(soup.find('span', class_='current').find('span').get_text())
        except:
            fp.write(get_time() + '获取当前页码失败\n')
            quit(driver)

        # Page 1 is the final page to process. The exit happens outside the
        # try block so SystemExit is not swallowed and mis-logged as a
        # failure, and goes through quit() so flag.txt is reset, the log is
        # closed and the browser is shut down exactly once.
        if cur_page == 1:
            fp.write(get_time() + '删除完成\n')
            quit(driver)

        # Otherwise move to the previous page and re-parse the post list
        try:
            driver.find_element_by_id('pager_previous_%d' % (page)).click()
            time.sleep(2)

            pages = get_page(driver)
            soup = BeautifulSoup(pages, 'lxml')
            ol = soup.find('ol', id='msgList')
        except:
            fp.write(get_time() + '翻页失败%d\n' % page)
            quit(driver)

        page += 1

        time.sleep(1)
python bs4 selenium
最后由ShiKun修改于2017-10-08 22:41
发表新评论
雷姆
拉姆