没啥说的,随便贴贴
某选修课,比赛数据在网页上,但是分析又需要在Excel内进行,每一次都逐个复制太笨了,就随便写个脚本把数据爬下来写到Excel里
整个实现逻辑还是有点小乱,不过也懒得改了,反正自用。
以后写之前还是要打好腹稿,变量命名也要考虑好,后期再改变量名真的很痛苦。
话说主页的文章显示还是有点小问题
import time

import xlrd
import xlwt
from selenium import webdriver
from selenium.webdriver.common.by import By
from xlutils.copy import copy
# Login credentials -- replace the placeholders before running.
userName = "用户名"
psw = "密码"
logUrl = "http://www.ibizsim.cn/main/login"
# Private report page -- replace the gameid/teamid placeholders with real values.
dataUrl = "http://www.ibizsim.cn/games/private_report?gameid=你的gameID&teamid=你的队伍ID"
# Name of the workbook written at the end of the run.
fileName = "bizData.xls"

# Locators and layout constants used by the scraper.
_LOGIN_FORM_XPATH = '//*[@id="text_top"]/div/div/div[2]/form'
_TAB_LIST_XPATH = '//*[@id="mygames_main"]/ul'
_PRODUCT_PANEL_ID = "private_report_product"
_SERIAL_PANEL_ID = "private_report_serial"
# Sheet row at which the second table of the product panel is written.
_PRODUCT_SECOND_TABLE_ROW = 18
# Tables in the serial panel are titled in groups of this many markets
# per product.
_MARKETS_PER_PRODUCT = 4


def _write_table(sheet, table, start_row: int) -> int:
    """Copy one HTML table element into ``sheet`` and return the next row.

    All ``<th>`` texts found in ``table`` are written left to right on
    ``start_row``.  Then every ``<tr>`` is written on its own row, beginning
    at ``start_row`` as well; a ``<tr>`` without ``<td>`` cells still
    consumes a row, so its row keeps whatever the ``<th>`` pass put there.
    The sheet must have been created with ``cell_overwrite_ok=True`` because
    both passes may touch the same cells.

    Returns the index of the first row after the last ``<tr>`` written.
    """
    for col, header_cell in enumerate(table.find_elements(By.TAG_NAME, "th")):
        sheet.write(start_row, col, header_cell.text)

    row = start_row
    for table_row in table.find_elements(By.TAG_NAME, "tr"):
        for col, cell in enumerate(table_row.find_elements(By.TAG_NAME, "td")):
            sheet.write(row, col, cell.text)
        row += 1
    return row


def _login(driver):
    """Open the login page, fill in the user name and password, and submit."""
    driver.get(logUrl)
    driver.find_element(
        By.XPATH, _LOGIN_FORM_XPATH + "/div[2]/div/input"
    ).send_keys(userName)
    driver.find_element(
        By.XPATH, _LOGIN_FORM_XPATH + "/div[3]/div/input"
    ).send_keys(psw)
    driver.find_element(By.XPATH, _LOGIN_FORM_XPATH + "/div[4]/input").click()


def _write_serial_panel(driver, sheet):
    """Write every table of the time-series panel, one below the other.

    Each table is preceded by a title row ("产品N 市场M", the market number
    cycling 1.._MARKETS_PER_PRODUCT before the product number advances) and
    followed by one empty separator row.
    """
    panel = driver.find_element(By.ID, _SERIAL_PANEL_ID)
    row = 0
    for index, table in enumerate(panel.find_elements(By.TAG_NAME, "table")):
        product_no, market_no = divmod(index, _MARKETS_PER_PRODUCT)
        sheet.write(
            row, 0, "产品" + str(product_no + 1) + " 市场" + str(market_no + 1)
        )
        row = _write_table(sheet, table, row + 1)
        row += 1  # leave one blank row between tables


def _scrape_tab(driver, wbk, tab, position):
    """Activate the ``position``-th report tab (1-based) and dump it to a sheet.

    A sheet named after the tab's text is added to ``wbk``.  What gets
    written depends on the ``aria-controls`` attribute of the tab's first
    ``<a>`` element:

    * any panel except the serial one: its first table, starting at row 0;
    * the product panel: additionally its second table, starting at
      ``_PRODUCT_SECOND_TABLE_ROW``;
    * the serial panel: all of its tables via ``_write_serial_panel``.
    """
    driver.find_element(
        By.XPATH, _TAB_LIST_XPATH + "/li[" + str(position) + "]"
    ).click()
    # Fixed pause after the click -- presumably to let the panel render.
    time.sleep(1)

    anchors = tab.find_elements(By.TAG_NAME, "a")
    tab_name = tab.text
    sheet = wbk.add_sheet(tab_name, cell_overwrite_ok=True)
    print(tab_name + " 表已创建,开始爬取...")

    # Read the attribute once instead of re-querying the browser per branch.
    panel_id = anchors[0].get_attribute("aria-controls")

    if panel_id != _SERIAL_PANEL_ID:
        first_table = driver.find_element(
            By.XPATH, '//*[@id="' + str(panel_id) + '"]/table'
        )
        _write_table(sheet, first_table, 0)

    if panel_id == _PRODUCT_PANEL_ID:
        second_table = driver.find_element(
            By.XPATH, '//*[@id="' + _PRODUCT_PANEL_ID + '"]/table[2]'
        )
        _write_table(sheet, second_table, _PRODUCT_SECOND_TABLE_ROW)

    if panel_id == _SERIAL_PANEL_ID:
        _write_serial_panel(driver, sheet)

    print(tab_name + " 表已爬取完成。")


def main():
    """Log in, scrape every report tab into its own sheet, and save the workbook.

    The browser session is always shut down, even when a locator fails
    part-way through; the workbook is only saved after all tabs succeeded.
    """
    driver = webdriver.Chrome()
    try:
        _login(driver)

        wbk = xlwt.Workbook(encoding='utf-8', style_compression=0)

        driver.get(dataUrl)
        tab_list = driver.find_element(By.XPATH, _TAB_LIST_XPATH)
        tabs = tab_list.find_elements(By.TAG_NAME, "li")
        for position, tab in enumerate(tabs, start=1):
            _scrape_tab(driver, wbk, tab, position)

        print("爬取完成,文件生成中...")
        wbk.save(fileName)
        print("文件生成完毕!")
    finally:
        # Release the browser and the chromedriver process.
        driver.quit()


if __name__ == "__main__":
    main()
Comments NOTHING