CNUGet

发布于 2021-04-14  34 次阅读


爬取某网站的图片

先登录账号,然后逐个爬取点赞作品中的图片,并分目录保存

整个过程还是挺顺利的,只是页面元素要下拉到底才能完全加载,这也是程序用时最多的步骤。

博客主页显示的部分文章,会把代码行号也显示出来,有点怪

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
import os
import re
import time
import random

# Chrome options: run the browser headless (no visible window).
# NOTE: `options` is module-global and reused by picUrlget/workListGet below.
options = Options()
options.add_argument("--headless")

# Optional custom User-Agent header (currently disabled).
# user_agent = (
#     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"
# )

# options.add_argument('user-agent=%s'%user_agent)

# Scroll the page down in chunks until the document height stops growing,
# i.e. all lazy-loaded content has been rendered.
#
# Fix: the original reused one variable `t` both as the while-loop flag and
# as the random sleep interval, so the flag was clobbered every iteration
# and the loop only worked because floats are truthy. The dead `i` page
# counter is removed as well.
def pulldown(driver):
    while True:
        height_before = driver.execute_script("return document.body.scrollHeight;")
        for _ in range(20):
            # Random delay so scrolling looks less bot-like and gives the
            # lazy loader time to fetch content.
            time.sleep(random.uniform(1, 2))
            driver.execute_script("window.scrollBy(0,1000)")
        height_after = driver.execute_script("return document.body.scrollHeight;")
        # No growth after a full round of scrolling -> page fully loaded.
        if height_before == height_after:
            break

# Jump the viewport down the page in large vertical steps to trigger
# lazy loading (alternative to pulldown; currently unused by the script).
def loadAll(driver):
    time.sleep(1)
    step = 0
    while step < 10:
        # scrollTo(x, y): x controls horizontal offset, y vertical
        driver.execute_script('window.scrollTo(0,%s)' % (step * 10000))
        time.sleep(0.05)
        step += 1
    time.sleep(1)

# 文件夹创建函数
def mkdir(fileName):
    """Create directory *fileName* if it does not exist.

    Returns True when the directory was created, False when the path
    already existed.
    """
    try:
        # EAFP: attempt creation directly instead of exists()+mkdir(),
        # which is racy between the check and the call (TOCTOU).
        os.mkdir(fileName)
    except FileExistsError:
        print(fileName + "已存在")
        return False
    print(fileName + "创建成功")
    return True

# 图片保存函数
def PicGet(picUrl, picName):
    """Download the image at *picUrl* and write its bytes to *picName*."""
    # timeout keeps one dead connection from hanging the whole crawl
    r = requests.get(picUrl, timeout=30)
    # fail loudly instead of silently saving an HTML error page as a .jpg
    r.raise_for_status()
    with open(picName, 'wb') as f:
        f.write(r.content)

# 图片url列表获取函数
def picUrlget(workUrl):
    """Return a list for one work page: first element is the work title,
    followed by the image URLs with their query strings stripped.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(workUrl)
        # images are lazy-loaded; scroll to the bottom so every <img> gets a src
        pulldown(driver)
        imgs = driver.find_element_by_xpath('//*[@id="work_body"]').find_elements_by_xpath('//*[@class="bodyImg lazy"]')
        picUrlList = [driver.find_element_by_xpath('/html/body/div[1]/div[2]/h2').text]
        # keep everything before the '?' so trailing query params are dropped
        pattern = re.compile(r'http.*(?=\?)')
        for img in imgs:
            m = pattern.match(img.get_attribute('src'))
            # Fix: the original tested `picUrl.group() != None`, which raises
            # AttributeError when the src has no query string and the match
            # is None; guard on the match object itself instead.
            if m is not None:
                picUrlList.append(m.group())
        return picUrlList
    finally:
        # quit even on errors so chromedriver processes don't leak
        driver.quit()

# 获取点赞作品的url列表
def workListGet(email="你的邮箱", password="你的密码",
                likesUrl="http://www.cnu.cc/users/recommended/1199240"):
    """Log in to the site and return the href of every liked work.

    The credentials and the liked-works page URL are now parameters
    (with the original hard-coded values as defaults) so the function
    can be reused for other accounts without editing the source.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("http://www.cnu.cc/login")
        driver.find_element_by_xpath('//*[@id="email"]').send_keys(email)
        driver.find_element_by_xpath('//*[@id="password"]').send_keys(password)
        driver.find_element_by_xpath('/html/body/div[2]/div/form/div[3]/button').click()
        driver.get(likesUrl)
        # the list is lazy-loaded; scroll to the bottom to load every entry
        pulldown(driver)
        anchors = driver.find_element_by_xpath('//*[@id="recommendForm"]').find_elements_by_tag_name('a')
        return [a.get_attribute('href') for a in anchors]
    finally:
        # quit even on errors so chromedriver processes don't leak
        driver.quit()

# 开始主程序
def main():
    """Crawl every liked work: one directory per work, images numbered from 1."""
    # 获取点赞作品列表
    workList = workListGet()
    # 逐个保存作品
    for work in workList:
        picUrlList = picUrlget(work)
        workTitle = picUrlList[0]  # 首项为作品标题
        print("开始下载" + workTitle)
        print("生成目录:" + workTitle)
        mkdir(workTitle)
        # remaining elements are the image URLs; number files from 1
        for i, picUrl in enumerate(picUrlList[1:], start=1):
            print("开始保存第" + str(i) + "张图片")
            PicGet(picUrl, './' + workTitle + '/' + str(i) + '.jpg')
            print("第" + str(i) + "张图片保存完毕")

# Guard so importing this module doesn't immediately start the crawl.
if __name__ == "__main__":
    main()