from IPython.core.interactiveshell import InteractiveShell 
# Make the notebook display the value of every top-level expression in a cell,
# not only the last one (cells below rely on this to show several results).
InteractiveShell.ast_node_interactivity = "all"


# 스크래핑에 필요한 라이브러리를 불러와봅시다.

import requests # 요청, 응답을 위해
from bs4 import BeautifulSoup # 만들어진 DOM의 파싱을 위해


# Request the example site and build a BeautifulSoup object from the response.

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
res = requests.get(
    "http://books.toscrape.com/catalogue/category/books/travel_2/index.html",
    timeout=10,
)
res.raise_for_status()

# Hand BeautifulSoup the raw bytes rather than res.text: res.text is decoded
# with the charset from the HTTP headers (falling back to ISO-8859-1 when the
# header names none), which garbles non-ASCII characters such as the
# apostrophe in "Noah's". Given bytes, BeautifulSoup detects the encoding from
# the document itself.
soup = BeautifulSoup(res.content, "html.parser")


# Look up the first element matching the <h3> tag, show it as-is,
# then print an indented (prettified) rendering of the same element.

first_h3 = soup.find("h3")
first_h3
pretty_h3 = first_h3.prettify()
print(pretty_h3)

<h3><a href="../../../its-only-the-himalayas_981/index.html" title="It's Only the Himalayas">It's Only the Himalayas</a></h3>

<h3>
 <a href="../../../its-only-the-himalayas_981/index.html" title="It's Only the Himalayas">
  It's Only the Himalayas
 </a>
</h3>


# Find every element that matches the <h3> tag.

h3_results = soup.find_all("h3")
print(len(h3_results))
print()

# Iterate over the result list built above (the original looped over an
# undefined name, `outputs`, which raises NameError). Each element is followed
# by a blank line to keep the listing readable.
for h3 in h3_results:
    print(h3)
    print()

11

<h3><a href="../../../its-only-the-himalayas_981/index.html" title="It's Only the Himalayas">It's Only the Himalayas</a></h3>

<h3><a href="../../../full-moon-over-noahs-ark-an-odyssey-to-mount-ararat-and-beyond_811/index.html" title="Full Moon over Noahâs Ark: An Odyssey to Mount Ararat and Beyond">Full Moon over Noahâs ...</a></h3>

<h3><a href="../../../see-america-a-celebration-of-our-national-parks-treasured-sites_732/index.html" title="See America: A Celebration of Our National Parks &amp; Treasured Sites">See America: A Celebration ...</a></h3>

<h3><a href="../../../vagabonding-an-uncommon-guide-to-the-art-of-long-term-world-travel_552/index.html" title="Vagabonding: An Uncommon Guide to the Art of Long-Term World Travel">Vagabonding: An Uncommon Guide ...</a></h3>

<h3><a href="../../../under-the-tuscan-sun_504/index.html" title="Under the Tuscan Sun">Under the Tuscan Sun</a></h3>

<h3><a href="../../../a-summer-in-europe_458/index.html" title="A Summer In Europe">A Summer In Europe</a></h3>

<h3><a href="../../../the-great-railway-bazaar_446/index.html" title="The Great Railway Bazaar">The Great Railway Bazaar</a></h3>

<h3><a href="../../../a-year-in-provence-provence-1_421/index.html" title="A Year in Provence (Provence #1)">A Year in Provence ...</a></h3>

<h3><a href="../../../the-road-to-little-dribbling-adventures-of-an-american-in-britain-notes-from-a-small-island-2_277/index.html" title="The Road to Little Dribbling: Adventures of an American in Britain (Notes From a Small Island #2)">The Road to Little ...</a></h3>

<h3><a href="../../../neither-here-nor-there-travels-in-europe_198/index.html" title="Neither Here nor There: Travels in Europe">Neither Here nor There: ...</a></h3>

<h3><a href="../../../1000-places-to-see-before-you-die_1/index.html" title="1,000 Places to See Before You Die">1,000 Places to See ...</a></h3>


# Goal: pull out just the title of each book.
# Start by grabbing a single <h3> element and inspecting it.

book = soup.find(name="h3")
book

<h3><a href="../../../its-only-the-himalayas_981/index.html" title="It's Only the Himalayas">It's Only the Himalayas</a></h3>


# Shortcut for reaching the <a> tag nested directly under the <h3> tag:
# attribute-style access on the parent tag.
link = book.a
link
# The visible text of that <a> tag.
link.text

<a href="../../../its-only-the-himalayas_981/index.html" title="It's Only the Himalayas">It's Only the Himalayas</a>

"It's Only the Himalayas"


# Collect every <h3> element, then print the visible link text of each one.
# Long titles come out shortened with "..." here, as the recorded output shows.
h3_results = soup.find_all("h3")

for heading in h3_results:
    link = heading.a
    print(link.text)

It's Only the Himalayas
Full Moon over Noahâs ...
See America: A Celebration ...
Vagabonding: An Uncommon Guide ...
Under the Tuscan Sun
A Summer In Europe
The Great Railway Bazaar
A Year in Provence ...
The Road to Little ...
Neither Here nor There: ...
1,000 Places to See ...


# The complete, untruncated title is stored in the <a> tag's "title"
# attribute; tag attributes are read with dictionary-style indexing.
for heading in h3_results:
    link = heading.a
    print(link["title"])

It's Only the Himalayas
Full Moon over Noahâs Ark: An Odyssey to Mount Ararat and Beyond
See America: A Celebration of Our National Parks & Treasured Sites
Vagabonding: An Uncommon Guide to the Art of Long-Term World Travel
Under the Tuscan Sun
A Summer In Europe
The Great Railway Bazaar
A Year in Provence (Provence #1)
The Road to Little Dribbling: Adventures of an American in Britain (Notes From a Small Island #2)
Neither Here nor There: Travels in Europe
1,000 Places to See Before You Die

일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

Selenium의 기초 WebDriver (0)	2023.10.26
요청시 헤더의 수정과 페이지네이션(Pagination) (0)	2023.10.25
스크래핑할 요소의 타게팅 - ID & Class (0)	2023.10.25
파싱과 스크래핑의 기본, BeautifulSoup (0)	2023.10.25
requests 라이브러리 (0)	2023.10.24

터칭 데이터

터칭 데이터

스크래핑할 요소의 타게팅 - 태그 본문

스크래핑할 요소의 타게팅 - 태그

2-3. 원하는 요소 가져오기 I - 책 이름 모으기

Target: Mock Book Data

가져온 객체에서 책 제목 뽑아보기

먼저 h3 태그 한개만 살펴보자

모든 h3 태그들에 접근해 책 제목을 가져오는 방법

반복문을 사용

중간에 잘리지 않은 완전한 형태의 책제목 얻기

어떤 태그의 속성(attribute)에 접근하는 방법은 딕셔너리와 같다.

'웹 스크래핑(Web scraping)' 카테고리의 다른 글

티스토리툴바