去年氣象局網頁改版為響應式網頁後，一直沒時間更新爬蟲。這次使用 Python + Selenium + BeautifulSoup，透過 WebDriver 的 chromedriver 讓 Chrome 操作網頁 UI 完成查詢設定後，再從網頁內容爬取資料。程式先將每個月的彙整表拉出來；因為新版彙整表資訊不完整，缺少經緯度等資料，所以須再開啟每一個地震的詳細資料頁，重新爬一次。
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 14:05:07 2021

@author: ghosty
"""
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
# Base URL of the CWB earthquake website; page paths are appended to it.
cwbUrl = 'https://scweb.cwb.gov.tw'

# Create the single Chrome session that the scraping functions share.
options = Options()
options.add_argument("--disable-notifications")
# `options=` replaces the deprecated `chrome_options=` keyword; it is accepted
# by Selenium 3.8+ and is the only spelling Selenium 4 still understands.
# NOTE(review): recent Selenium 4 releases expect the driver path to be passed
# through a Service object instead of positionally -- confirm against the
# installed Selenium version.
chrome = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def getQuakeUrlList(year, month):
    """Return the detail-page URLs of the earthquakes listed for one month.

    Drives the shared ``chrome`` session: opens the earthquake data page,
    types the requested month into the search box, switches the table-length
    selector to its sixth option and parses the second ``<table>`` of the
    resulting page.  Every parsed row is also printed to stdout.

    Args:
        year: Year to query; converted with ``str()``.
        month: Month to query; converted with ``str()``.

    Returns:
        A list of absolute URLs (``cwbUrl`` + the row's link), in table
        order.  Rows without a link in the seventh cell are skipped.

    Raises:
        IndexError: If the page contains fewer than two ``<table>`` elements.
    """
    # Open the CWB earthquake list page.
    chrome.get(cwbUrl + '/zh-tw/earthquake/data/')

    # Fill in the month search box.  The input is read-only in the page, so
    # the attribute is removed before typing into it.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[1]/div/h2/input[1]'
    searchMonth = chrome.find_element(By.XPATH, xpath)
    chrome.execute_script('arguments[0].removeAttribute("readonly")', searchMonth)
    searchMonth.clear()
    searchMonth.send_keys(str(year) + '年' + str(month) + '月')
    searchMonth.send_keys(Keys.RETURN)

    # Switch the table-length selector; per the original author the option at
    # index 5 is 'All', so the whole month is shown on one page.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[4]/div/div[1]/div/div[2]/div/label/select'
    tableLength = chrome.find_element(By.XPATH, xpath)
    Select(tableLength).select_by_index(5)

    # Fixed pause before grabbing the HTML (presumably to let the table
    # finish refreshing -- TODO confirm an explicit wait is not needed).
    time.sleep(3)
    soup = BeautifulSoup(chrome.page_source, 'html.parser')

    quakeList = []
    quakeTable = soup.find_all('table')[1].find_all('tr')
    for row in quakeTable[1:]:  # the first row is skipped as the header
        col = row.find_all('td')
        # Placeholder rows (e.g. an empty-result row) have no seventh cell or
        # no link; skip them instead of crashing on col[6] / ['href'].
        link = col[6].find('a') if len(col) > 6 else None
        if link is None or not link.get('href'):
            continue
        quakeList.append(cwbUrl + link['href'])
        print(col[0].text, col[1].text, col[2].text, col[3].text, col[4].text, col[5].text,
              '-', col[6].text[:19], link['href'], link.text.replace(' ', ''))
    return quakeList
def _compactText(text):
    """Return *text* with every newline and space character removed."""
    return text.replace('\n', '').replace(' ', '')


def getQuakeInfo(year, month):
    """Print the detail fields of every earthquake listed for one month.

    For each URL returned by ``getQuakeUrlList(year, month)`` the detail page
    is loaded in the shared ``chrome`` session and all ``<li>`` elements are
    scanned for the first one whose text contains '發震時間'.  For that item
    the function prints a separator, the preceding ``<li>`` (its text, or the
    ``alt`` of its image when the text is empty), up to five ``<li>`` items
    starting at the match, and the URL of the report image.

    Args:
        year: Year to query; passed through to ``getQuakeUrlList``.
        month: Month to query; passed through to ``getQuakeUrlList``.

    Returns:
        None.  All output goes to stdout.
    """
    quakeUrlList = getQuakeUrlList(year, month)
    for quakeUrl in quakeUrlList:
        chrome.get(quakeUrl)
        # Fixed pause before reading page_source (presumably to let the page
        # finish rendering).
        time.sleep(2)
        soup = BeautifulSoup(chrome.page_source, 'html.parser')
        infoList = soup.find_all('li')
        for i, info in enumerate(infoList):
            if '發震時間' not in info.text:
                continue
            print('-----------------')
            # Only look at the preceding item when one exists; index -1 would
            # silently wrap around to the last <li> on the page.
            if i > 0:
                previous = infoList[i - 1]
                label = _compactText(previous.text)
                if label:
                    print(label)
                elif previous.img is not None and previous.img.get('alt') is not None:
                    print(previous.img['alt'])
            # Slicing cannot run past the end of the list, unlike indexing
            # infoList[i + j] for a fixed j in range(5).
            for item in infoList[i:i + 5]:
                print(_compactText(item.text))
            reportImg = quakeUrl.replace('https://scweb.cwb.gov.tw/zh-tw/earthquake/details/',
                                         'https://scweb.cwb.gov.tw/zh-tw/earthquake/imgs/')
            print('地震報告:' + reportImg)
            break  # only the first matching item on each page is reported
# Scrape March through December of 2020.
year = 2020
try:
    for month in range(3, 13):
        getQuakeInfo(year, month)
finally:
    # Always shut the browser down, even when a month fails part-way through;
    # otherwise the Chrome / chromedriver processes are left running.
    chrome.quit()
沒有留言:
張貼留言