Python (파이썬) 크롤링 - 한국 홀리데이 스크랩

2020. 5. 4. 20:09

https://www.timeanddate.com/holidays/south-korea/

Holidays and observances in South Korea in 2020

Time of Remembrance and Reconciliation for Those Who Lost Their Lives during the Second World War The United Nations’ (UN) Time of Remembrance and Reconciliation for Those Who Lost Their Lives during the Second World War is annually held over two days, f

www.timeanddate.com

방법 1. 수업 때 배운 가장 고전적 방법

from selenium import webdriver
import requests, bs4
import pandas as pd

url = 'https://www.timeanddate.com/holidays/south-korea/'

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the holiday table.
page = requests.get(url, timeout=10)
page.raise_for_status()

# Hand the raw bytes to BeautifulSoup and let it detect the encoding itself,
# instead of decoding to str and re-encoding to UTF-8 by hand.
response = bs4.BeautifulSoup(page.content, 'html.parser')

# 1. Locate the table.
tableData = response.find('table', {'id': 'holidays-table'})
if tableData is None:
    # find() returns None when nothing matches; fail with a readable message
    # rather than an AttributeError on the next line.
    raise RuntimeError("table with id 'holidays-table' was not found in the page")
# tableData

# 2. Collect the column headers.
theadTags = tableData.find('thead')
theadTags

thTags = theadTags.find_all('th')
thTags

# strip() turns the blank (non-breaking-space) header cell into ''.
columnHeaderList = [th.text.strip() for th in thTags]

columnHeaderList    # result: ['Date', '', 'Name', 'Type']

# 3. Collect the body data.
tbodyTags = tableData.find('tbody')
thTags = tbodyTags.find_all('th')
# thTags[0].text # debugging aid. (0:30)

tdTags = tbodyTags.find_all('td')
# tdTags[i].text # debugging aid. Three <td> per <th>, so (0:90).

# This "classic" approach pairs the flat <th> and <td> lists by position:
# row i is the i-th <th> (date) plus the three <td> cells at 3*i .. 3*i+2
# (weekday, name, type). It assumes every <th> row has exactly three <td>.
rowList = []
for i, th in enumerate(thTags):
    first = 3 * i
    rowList.append([
        th.text,
        tdTags[first].text,
        tdTags[first + 1].text,
        tdTags[first + 2].text,
    ])

rowList

# result:
# [['1월 1일', '수요일', "New Year's Day", 'Public Holiday'],
#   . . .
#  ['12월 31일', '목요일', "New Year's Eve", 'Observance']]

# 4. Build the DataFrame.
koreaHoliday = pd.DataFrame(rowList, columns=columnHeaderList)
koreaHoliday

cf. 매트릭스 리스트를 만드는 과정은 이렇게 바꿀 수 있다.

# "Before" version of the row-building step: each row is accumulated in a
# temporary list (columnList), appended to rowList, and then columnList is
# rebound to a fresh list for the next row.
# Uses thTags / tdTags, the flat <th> and <td> lists taken from the table body.
rowList = []
columnList = []

thTagsLen = len(thTags)
for i in range(0,thTagsLen):
    # Row i = the i-th <th> followed by the three <td> cells at 3*i .. 3*i+2.
    columnList.append(thTags[i].text)
    columnList.append(tdTags[3 * i].text)
    columnList.append(tdTags[3 * i + 1].text)
    columnList.append(tdTags[3 * i + 2].text)
    
    rowList.append(columnList)
    # Rebind to a new list (rather than clearing in place) so the row that was
    # just stored in rowList is not emptied.
    columnList = []
    
rowList

---------------------------------------------------------------------------------------

# Instead of keeping two lists, keep only rowList and build each row as a
# list literal at the moment it is appended.

rowList = []

thTagsLen = len(thTags)
for i, date_cell in enumerate(thTags):
    # The three <td> cells belonging to this <th> start at offset 3*i.
    offset = 3 * i
    rowList.append([
        date_cell.text,             # date
        tdTags[offset].text,        # weekday
        tdTags[offset + 1].text,    # holiday name
        tdTags[offset + 2].text,    # holiday type
    ])

rowList

2. 또 다른 방법 (for _ in list이름) 사용

from selenium import webdriver
import requests, bs4
import pandas as pd

url = 'https://www.timeanddate.com/holidays/south-korea/'

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the holiday table.
page = requests.get(url, timeout=10)
page.raise_for_status()

# Hand the raw bytes to BeautifulSoup and let it detect the encoding itself,
# instead of decoding to str and re-encoding to UTF-8 by hand.
response = bs4.BeautifulSoup(page.content, 'html.parser')

# 1. Locate the table.
tableData = response.find('table', {'id': 'holidays-table'})
if tableData is None:
    # find() returns None when nothing matches; fail with a readable message
    # rather than an AttributeError on the next line.
    raise RuntimeError("table with id 'holidays-table' was not found in the page")
# tableData

# 2. Collect the column headers.
theadTags = tableData.find('thead')
theadTags

thTags = theadTags.find_all('th')
thTags

columnHeaderList = []

# Iterate over the elements directly instead of over indices. The loop
# variable is read, so it gets a real name rather than the throwaway `_`.
for th in thTags:
    columnHeaderList.append(th.text.strip())

columnHeaderList    # result: ['Date', '', 'Name', 'Type']

# 3. Collect the body data.
tbodyTags = tableData.find('tbody')
# tbodyTags.text # debugging aid

rowList = []

# Walk the <tr> elements explicitly. Iterating over tbodyTags itself yields
# every child node, including whitespace/comment strings between rows, and
# those string nodes have no find_all().
for tr in tbodyTags.find_all('tr'):
    if tr.text == '':
        # Skip rows with no text.
        continue

    # Search for cells inside the current row only, so cells from other rows
    # are never repeated.
    thTags = tr.find_all('th')
    tdTags = tr.find_all('td')

    # <th> cell(s) first, then the <td> cells, matching the header order.
    rowList.append([cell.text for cell in thTags + tdTags])

rowList

# result:
# [['1월 1일', '수요일', "New Year's Day", 'Public Holiday'],
#   . . .
#  ['12월 31일', '목요일', "New Year's Eve", 'Observance']]

# 4. Build the DataFrame.
koreaHoliday = pd.DataFrame(rowList, columns=columnHeaderList)
koreaHoliday

사실 '# 2. 컬럼 헤더 찾기'는 (for _ in list이름) 방법이 괜찮아 보이지만, '# 3. body 데이터 찾기'는 위의 1번 방법이 더 좋아 보인다. 훨씬 직관적이다.

3. 교수님 예제에 있는 방법

import requests, bs4
import pandas as pd

url = "https://www.timeanddate.com/holidays/south-korea/"
resp = requests.get(url)
websource = resp.text
bsobj = bs4.BeautifulSoup(websource, "html.parser")

# 1. Collect the column headers via a CSS selector.
thead = bsobj.select_one("#holidays-table > thead")
thead

# result: <thead><tr><th rowspan="2">Date</th><th rowspan="2"> </th><th rowspan="2">Name</th><th rowspan="2">Type</th></tr><tr></tr></thead>

iter_th = thead.find_all("th")
iter_th

# result:
# [<th rowspan="2">Date</th>,
#  <th rowspan="2"> </th>,
#  <th rowspan="2">Name</th>,
#  <th rowspan="2">Type</th>]

# Header text is taken as-is (no strip), so the blank cell stays '\xa0'.
hcontent = [header_cell.text for header_cell in iter_th]

hcontent

# result: ['Date', '\xa0', 'Name', 'Type']

# 2. Collect the body data, one <tr> at a time.
tbody = bsobj.select_one("#holidays-table > tbody")
iter_tr = tbody.find_all("tr")

allcontent = []

for tr in iter_tr:
    # Rows with no text at all are skipped.
    if not tr.text:
        continue
    # First column (the date, e.g. Jan 1) is built from <th> cells ...
    content = [cell.text for cell in tr.find_all("th")]
    # ... and the remaining columns from <td> cells.
    content += [cell.text for cell in tr.find_all("td")]
    allcontent.append(content)

# 3. Build the DataFrame, then label its columns with the header text.
finalResult = pd.DataFrame(allcontent)
finalResult.columns = hcontent
finalResult

'개발자 > Python' 카테고리의 다른 글

Python (파이썬) Selenium 셀레니움 (0)	2020.05.04
Python (파이썬) 웹 크롤링 기초 - Spark 퀴즈 정보 긁어오기 (Selenium O) (0)	2020.05.04
Python (파이썬) while(True) (0)	2020.04.29
Python (파이썬) select 함수 (0)	2020.04.29
Python (파이썬) Class 클래스, 인스턴스, 생성자(__init__) (0)	2020.04.27

좋은 개발자가 되자

Python (파이썬) 크롤링 - 한국 홀리데이 스크랩

방법 1. 수업 때 배운 가장 고전적 방법

2. 또 다른 방법 (for _ in list이름) 사용

3. 교수님 예제에 있는 방법

'개발자 > Python' 카테고리의 다른 글

+ Recent posts

티스토리툴바