2021年5月23日 星期日

[Python] 從中央氣象局下載地震活動彙整列表

去年氣象局網頁改版為響應式網頁後，一直沒時間更新爬蟲。這次使用 Python + Selenium + BeautifulSoup，透過 webdriver 的 chromedriver 操作 Chrome 完成 UI 設定後，從網頁內容爬取資料。程式先將每個月的彙整表拉出來；因為新版彙整表資訊不完整，缺少經緯度等資料，所以須開啟每一個地震的詳細資料頁，重新爬取一次。

# -*- coding: utf-8 -*-

"""

Created on Sun May 23 14:05:07 2021

@author: ghosty

"""

from selenium import webdriver

from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup

import time

         

# --- Constants ---
cwbUrl = 'https://scweb.cwb.gov.tw'  # Central Weather Bureau seismicity site

# --- Create the shared Chrome session used by all scraping functions ---
options = Options()
options.add_argument("--disable-notifications")  # suppress browser notification pop-ups
# NOTE: the 'chrome_options' keyword is deprecated in Selenium; pass the
# Options object via the 'options' keyword instead (supported since 3.8).
chrome = webdriver.Chrome(ChromeDriverManager().install(), options=options)

 

def getQuakeUrlList(year, month):
    """Scrape the CWB monthly earthquake list and return detail-page URLs.

    Uses the module-level ``chrome`` driver. Prints one summary line per
    quake as a side effect.

    Args:
        year:  four-digit year, e.g. 2020.
        month: month number. NOTE(review): sent un-padded, so March 2020
               becomes '20203' — confirm the site's search box accepts this.

    Returns:
        list[str]: absolute URLs of each earthquake's detail page.
    """
    # Open the CWB earthquake data page.
    chrome.get(cwbUrl + '/zh-tw/earthquake/data/')

    # The month filter input is readonly; strip the attribute so we can
    # type the search month into it directly.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[1]/div/h2/input[1]'
    searchMonth = chrome.find_element_by_xpath(xpath)
    chrome.execute_script('arguments[0].removeAttribute("readonly")', searchMonth)
    searchMonth.clear()  # reuse the element already found (was a redundant second lookup)
    searchMonth.send_keys(str(year) + str(month))
    searchMonth.send_keys(Keys.RETURN)

    # Switch the results table's page-length selector to 'All' (index 5)
    # so the whole month fits in a single page.
    xpath = '/html/body/div[1]/div[2]/div/div[2]/div[1]/div[4]/div/div[1]/div/div[2]/div/label/select'
    tableLength = chrome.find_element_by_xpath(xpath)
    Select(tableLength).select_by_index(5)

    # Give the page time to re-render, then parse the rendered HTML.
    time.sleep(3)
    soup = BeautifulSoup(chrome.page_source, 'html.parser')

    quakeList = []
    # Table index 1 holds the quake list; skip the header row.
    quakeTable = soup.find_all('table')[1].find_all('tr')
    for row in quakeTable[1:]:
        col = row.find_all('td')
        # Column 6 carries the link to the quake's detail page.
        quakeList.append(cwbUrl + col[6].find('a')['href'])
        print(col[0].text, col[1].text, col[2].text, col[3].text, col[4].text, col[5].text,
              '-', col[6].text[:19], col[6].find('a')['href'],
              col[6].find('a').text.replace(' ', ''))
    return quakeList

 

def getQuakeInfo(year, month):
    """Print the detail information for every earthquake in the given month.

    For each detail-page URL from getQuakeUrlList, load the page and locate
    the <li> containing '發震時間' (origin time). Print the quake title from
    the preceding item (plain text, or the <img> alt when the title is an
    image), then the five consecutive fields starting at the anchor
    (origin time, epicenter, depth, magnitude, relative position), and
    finally the URL of the official report image.
    """
    quakeUrlList = getQuakeUrlList(year, month)
    for quakeUrl in quakeUrlList:
        chrome.get(quakeUrl)
        time.sleep(2)  # allow the detail page to finish rendering
        soup = BeautifulSoup(chrome.page_source, 'html.parser')

        infoList = soup.find_all('li')
        for i, info in enumerate(infoList):
            # Anchor on the list item holding the origin time.
            if '發震時間' in info.text:
                print('-----------------')
                # The item before the anchor is the quake title; compute the
                # cleaned text once (was computed twice in the old version).
                title = infoList[i - 1].text.replace('\n', '').replace(' ', '')
                if len(title) > 0:
                    print(title)
                else:
                    # Title rendered as an image; its alt text carries it.
                    print(infoList[i - 1].img['alt'])
                # Origin time plus the next four fields.
                for j in range(5):
                    print(infoList[i + j].text.replace('\n', '').replace(' ', ''))
                # The report image shares the quake id, under .../imgs/.
                reportImg = quakeUrl.replace('https://scweb.cwb.gov.tw/zh-tw/earthquake/details/',
                                             'https://scweb.cwb.gov.tw/zh-tw/earthquake/imgs/')
                print('地震報告:' + reportImg)
                break  # only one anchor item per page

        

def main():
    """Scrape quake info for March through December 2020, then clean up."""
    year = 2020
    try:
        # range(3, 13) -> months 3..12 inclusive.
        for month in range(3, 13):
            getQuakeInfo(year, month)
    finally:
        chrome.quit()  # always release the browser/driver, even on error


if __name__ == '__main__':
    main()

 

沒有留言:

張貼留言