bizDataGet

发布于 2021-04-13  108 次阅读


没啥说的,随便贴贴

某门选修课的比赛数据发布在网页上,但分析又需要在 Excel 内进行。每次都逐个复制太笨了,于是随手写了个脚本,把数据爬下来并写入 Excel。

整个实现逻辑还是有点小乱,不过也懒得改了,反正自用。

以后写之前还是要打好腹稿,变量命名也要考虑好,后期再改变量名真的很痛苦。

话说主页的文章显示还是有点小问题

import time

import xlrd
import xlwt
from selenium import webdriver
from selenium.webdriver.common.by import By
from xlutils.copy import copy

# Account credentials and target URLs. The placeholder values below must be
# replaced with real ones before the script is run.
userName = "用户名"
psw = "密码"  # username and password
logUrl = "http://www.ibizsim.cn/main/login"
dataUrl = "http://www.ibizsim.cn/games/private_report?gameid=你的gameID&teamid=你的队伍ID"  # must be changed to your own game ID / team ID
driver = webdriver.Chrome()

# Login flow
# All three login controls live under the same <form>; keep the shared prefix in one place.
_LOGIN_FORM_XPATH = '//*[@id="text_top"]/div/div/div[2]/form'
try:
    driver.get(logUrl)

    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath, which
    # was removed in Selenium 4.3; the By-based form also works on Selenium 3.
    driver.find_element(By.XPATH, _LOGIN_FORM_XPATH + '/div[2]/div/input').send_keys(userName)
    driver.find_element(By.XPATH, _LOGIN_FORM_XPATH + '/div[3]/div/input').send_keys(psw)  # locate by XPath and type in the credentials
    driver.find_element(By.XPATH, _LOGIN_FORM_XPATH + '/div[4]/input').click()  # click the login button
except Exception:
    # Do not leave an orphaned browser window behind when the login step fails.
    driver.quit()
    raise

# Create the workbook
fileName = "bizData.xls"
wbk = xlwt.Workbook(encoding='utf-8', style_compression=0)

# Row where the second table of the product report is placed in the sheet.
# This is the layout the script has always produced; it is only pushed further
# down when the first table is long enough that the two would overlap.
PRODUCT_TABLE2_START_ROW = 18
# The time-series tables are labelled product 1 / market 1..4, product 2 / market 1..4, ...
MARKETS_PER_PRODUCT = 4


def _write_table(sheet, table, start_row):
    """Copy one HTML table into ``sheet`` and return the next free row.

    Every ``<th>`` found in ``table`` is written left to right on
    ``start_row``. Then each ``<tr>`` takes one sheet row, starting again at
    ``start_row``, and its ``<td>`` cells are written left to right. A row
    that contains no ``<td>`` writes nothing but still consumes a sheet row,
    which is what keeps a th-only header row from being overwritten.
    """
    for col, th in enumerate(table.find_elements(By.TAG_NAME, 'th')):
        sheet.write(start_row, col, th.text)

    row = start_row
    for tr in table.find_elements(By.TAG_NAME, 'tr'):
        for col, td in enumerate(tr.find_elements(By.TAG_NAME, 'td')):
            sheet.write(row, col, td.text)
        row += 1
    return row


try:
    # Open the report page and collect the list of report tabs
    driver.get(dataUrl)
    tbUl = driver.find_element(By.XPATH, '//*[@id="mygames_main"]/ul')
    tbList = tbUl.find_elements(By.TAG_NAME, "li")

    # Scrape the reports one by one. The 1-based index comes from enumerate so
    # it advances on every tab; previously it was only incremented for tabs
    # other than the time-series one, so any tab following the time-series tab
    # would have clicked the wrong <li>.
    for i, tb in enumerate(tbList, start=1):
        driver.find_element(By.XPATH, '//*[@id="mygames_main"]/ul/li[' + str(i) + ']').click()
        time.sleep(1)  # give the tab panel time to render after the click

        # The tab's link names the panel it controls via aria-controls
        panelId = str(tb.find_element(By.TAG_NAME, 'a').get_attribute('aria-controls'))
        title = tb.text

        # Create the worksheet for this report
        sheet = wbk.add_sheet(title, cell_overwrite_ok=True)
        print(title + " 表已创建,开始爬取...")

        if panelId == "private_report_serial":
            # Time-series data: the panel holds many tables, written one below
            # the other, each preceded by a "product N  market M" title row.
            panel = driver.find_element(By.ID, 'private_report_serial')
            row = 0
            for n, table in enumerate(panel.find_elements(By.TAG_NAME, 'table')):
                productN = n // MARKETS_PER_PRODUCT + 1
                marketN = n % MARKETS_PER_PRODUCT + 1
                sheet.write(row, 0, "产品" + str(productN) + "  市场" + str(marketN))
                row = _write_table(sheet, table, row + 1)
                row += 1  # leave one blank row between tables
        else:
            # Regular report: the first table of the panel goes to the top of the sheet
            firstTable = driver.find_element(By.XPATH, '//*[@id="' + panelId + '"]/table')
            nextRow = _write_table(sheet, firstTable, 0)

            # End-of-period product status: append the panel's second table below
            if panelId == "private_report_product":
                secondTable = driver.find_element(By.XPATH, '//*[@id="private_report_product"]/table[2]')
                # Never start inside the first table's rows (cells would be silently overwritten)
                _write_table(sheet, secondTable, max(PRODUCT_TABLE2_START_ROW, nextRow + 1))

        print(title + " 表已爬取完成。")
finally:
    # Always close the browser, whether scraping succeeded or not.
    driver.quit()

print("爬取完成,文件生成中...")
wbk.save(fileName)
print("文件生成完毕!")