爬取某网站的图片
先登录账号,然后逐个爬取点赞作品中的图片,并分目录保存
整个过程还是挺顺利的,只是页面元素要下拉到底才能完全加载,这也是程序用时最多的步骤。
博客主页显示的部分文章,会把代码行号也显示出来,有点怪
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
import os
import re
import time
import random
# Chrome options for the scraping sessions: run headless (no visible window).
options = Options()
options.add_argument("--headless")
# If the site ever starts rejecting the default headless user-agent,
# a desktop one can be spoofed like this:
# user_agent = (
#     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"
# )
# options.add_argument('user-agent=%s'%user_agent)
# 下拉页面函数
def pulldown(driver, steps_per_pass=20, step_px=1000, min_pause=1, max_pause=2):
    """Scroll the page to the very bottom so lazy-loaded content renders.

    Scrolls down in ``step_px`` increments, pausing a random
    ``min_pause``..``max_pause`` seconds between steps to look less
    bot-like, and stops once a full pass no longer increases
    ``document.body.scrollHeight`` (i.e. nothing new was loaded).

    Args:
        driver: a selenium WebDriver with the target page already loaded.
        steps_per_pass: scroll steps between two height measurements.
        step_px: pixels scrolled per step.
        min_pause, max_pause: bounds of the random sleep between steps.

    FIX: the original reused the variable ``t`` as both the while-loop
    flag and the random sleep duration, so the loop condition silently
    depended on a truthy float; it also kept an unused page counter.
    """
    keep_scrolling = True
    while keep_scrolling:
        height_before = driver.execute_script("return document.body.scrollHeight;")
        for _ in range(steps_per_pass):
            time.sleep(random.uniform(min_pause, max_pause))
            driver.execute_script("window.scrollBy(0,%d)" % step_px)
        height_after = driver.execute_script("return document.body.scrollHeight;")
        if height_before == height_after:
            # no new content appeared during this pass -> we reached the bottom
            keep_scrolling = False
def loadAll(driver):
    """Jump through the page in big vertical strides to trigger lazy loading.

    Quick alternative to pulldown(): performs ten absolute scrollTo jumps of
    10000px each (scrollTo's first argument is horizontal, second vertical),
    with one-second settle pauses before and after.
    """
    time.sleep(1)
    for step in range(10):
        driver.execute_script('window.scrollTo(0,%s)' % (step * 10000))
        time.sleep(0.05)
    time.sleep(1)
# 文件夹创建函数
def mkdir(fileName):
    """Create directory *fileName* if it does not already exist.

    Returns:
        True  -- the directory was created.
        False -- the path already existed.

    FIX: the original used an os.path.exists() pre-check followed by
    os.mkdir() (LBYL), which is racy: the directory can appear between
    the check and the creation and crash the script. This EAFP version
    lets os.mkdir() itself report an existing path.
    """
    try:
        os.mkdir(fileName)
    except FileExistsError:
        print(fileName + "已存在")
        return False
    print(fileName + "创建成功")
    return True
# 图片保存函数
def PicGet(picUrl, picName, timeout=30):
    """Download the image at *picUrl* and write it to file *picName*.

    Args:
        picUrl: direct URL of the image.
        picName: destination file path (binary write).
        timeout: seconds before requests.get gives up (new, default 30).

    FIX: the original requests.get() had no timeout, so a single stalled
    connection would hang the whole crawl indefinitely.
    """
    r = requests.get(picUrl, timeout=timeout)
    with open(picName, 'wb') as f:
        f.write(r.content)
# 图片url列表获取函数
def picUrlget(workUrl):
    """Collect the image URLs of one work page.

    Opens *workUrl* in a fresh headless Chrome, scrolls to the bottom so
    every lazy image gets its real src, and returns a list whose first
    element is the work title and whose remaining elements are the image
    URLs with the query string stripped.

    FIXES:
    - the original called .group() on the result of re.match() BEFORE
      checking for None, raising AttributeError whenever an img src did
      not match the pattern (e.g. a still-unloaded lazy placeholder);
    - driver.quit() now runs in a finally block so the headless Chrome
      process is not leaked when an xpath lookup raises;
    - the regex is a raw string.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(workUrl)
        pulldown(driver)  # page must be fully scrolled or images stay lazy
        listBody = driver.find_element_by_xpath('//*[@id="work_body"]').find_elements_by_xpath('//*[@class="bodyImg lazy"]')
        picUrlList = []
        workTitle = driver.find_element_by_xpath('/html/body/div[1]/div[2]/h2').text
        picUrlList.append(workTitle)
        pattern = r'http.*(?=\?)'  # keep everything before the '?' query string
        for picPath in listBody:
            m = re.match(pattern, picPath.get_attribute('src'))
            if m is not None:
                picUrlList.append(m.group())
        return picUrlList
    finally:
        driver.quit()
# 获取点赞作品的url列表
def workListGet():
    """Log in to cnu.cc and return the URLs of all liked ("recommended") works.

    Fills the login form (replace the placeholder credential strings with
    real ones), opens the recommended-works page for the account, scrolls
    it to the bottom so every entry loads, and collects the href of each
    anchor inside the recommendForm element.

    FIX: driver.quit() now runs in a finally block so the headless Chrome
    process is not leaked when the login or a lookup raises.
    """
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("http://www.cnu.cc/login")
        driver.find_element_by_xpath('//*[@id="email"]').send_keys("你的邮箱")
        driver.find_element_by_xpath('//*[@id="password"]').send_keys("你的密码")
        driver.find_element_by_xpath('/html/body/div[2]/div/form/div[3]/button').click()
        driver.get("http://www.cnu.cc/users/recommended/1199240")
        pulldown(driver)
        listBody = driver.find_element_by_xpath('//*[@id="recommendForm"]').find_elements_by_tag_name('a')
        return [work.get_attribute('href') for work in listBody]
    finally:
        driver.quit()
# ---- main program: fetch the liked-works list, then download each work ----
workList = workListGet()
for work in workList:
    picUrlList = picUrlget(work)
    title = picUrlList[0]  # first element is the work title, rest are image URLs
    print("开始下载" + title)
    print("生成目录:" + title)
    mkdir(title)
    for idx, picUrl in enumerate(picUrlList[1:], start=1):
        print("开始保存第" + str(idx) + "张图片")
        PicGet(picUrl, './' + title + '/' + str(idx) + '.jpg')
        print("第" + str(idx) + "张图片保存完毕")
Comments NOTHING