from IPython.core.interactiveshell import InteractiveShell
# Make the notebook display the value of every top-level expression in a
# cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"


import requests

# Fetch the NAVER main page with an HTTP GET request.
# requests applies no timeout by default, so one is set explicitly to make
# the call fail instead of hanging forever if the server never answers.
res = requests.get("https://www.naver.com", timeout=10)
res  # The Response object holding the HTTP response.

<Response [200]>


# Inspect the response headers through the .headers attribute.
res.headers

{'Server': 'NWS', 'Date': 'Tue, 24 Oct 2023 03:26:22 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Set-Cookie': 'PM_CK_loc=0460bf98b2b7618c7d521ff71a0d6bc4dc7770777ab110ed223fcf1ec9cc3efd; Expires=Wed, 25 Oct 2023 03:26:22 GMT; Path=/; HttpOnly', 'Cache-Control': 'no-cache, no-store, must-revalidate', 'Pragma': 'no-cache', 'X-Frame-Options': 'DENY', 'X-XSS-Protection': '1; mode=block', 'Content-Encoding': 'gzip', 'Strict-Transport-Security': 'max-age=63072000; includeSubdomains', 'Referrer-Policy': 'unsafe-url'}


# Inspect the response body as text through the .text attribute.

res.text[:1000] # The body is very long, so only the first 1000 characters are shown.
# .text simply returns the whole body as a single string.
# When only part of a web page is needed, avoid overusing .text.

'   <!doctype html> <html lang="ko" class="fzoom"> <head> <meta charset="utf-8"> <meta name="Referrer" content="origin"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <meta name="viewport" content="width=1190"> <title>NAVER</title> <meta name="apple-mobile-web-app-title" content="NAVER"/> <meta name="robots" content="index,nofollow"/> <meta name="description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/> <meta property="og:title" content="네이버"> <meta property="og:url" content="https://www.naver.com/"> <meta property="og:image" content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png"> <meta property="og:description" content="네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요"/> <meta name="twitter:card" content="summary"> <meta name="twitter:title" content=""> <meta name="twitter:url" content="https://www.naver.com/"> <meta name="twitter:image" content="https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png"> <meta name="twitter:description" c'


# Send a POST request, together with a payload, to the unique URL
# issued by webhook.site.
payload = {"name": "Hello", "age": 13}  # HTTP APIs mostly exchange data as JSON.

# Pass the dict through `json=` so it is serialized as a JSON body and sent
# with a `Content-Type: application/json` header. Passed positionally it
# binds to `data` and is sent form-encoded instead, which contradicts the
# JSON intent stated above.
# The timeout keeps the call from blocking indefinitely on an unresponsive
# server.
res = requests.post(
    "https://webhook.site/42c078a5-74e9-499f-a1f9-1a24200570c9",
    json=payload,
    timeout=10,
)
res

<Response [200]>


# The status code alone can also be checked to see how the request went.
res.status_code # status only
res.headers # all headers

200

{'Server': 'nginx', 'Content-Type': 'text/plain; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'X-Request-Id': '406c7fcd-8e75-4ec6-b6eb-867b55d266c9', 'X-Token-Id': '42c078a5-74e9-499f-a1f9-1a24200570c9', 'Cache-Control': 'no-cache, private', 'Date': 'Tue, 24 Oct 2023 03:34:39 GMT', 'Content-Encoding': 'gzip'}


import requests

# Download NAVER's robots.txt. The explicit timeout prevents the call from
# hanging forever, since requests applies no timeout by default.
res = requests.get("https://www.naver.com/robots.txt", timeout=10)


res.text  # Shown as a string repr, so line breaks appear as "\n".
print(res.text)  # Printed, so the line breaks are rendered.

'User-agent: *\nDisallow: /\nAllow : /$ \n'

User-agent: *
Disallow: /
Allow : /$


User-agent: * #모든 크롤러(로봇)에게
Disallow: / #사이트의 모든 경로에 대한 접근(크롤링)을 금지하되
Allow : /$ #루트 페이지(https://www.naver.com/)까지만 접근을 허용한다. (블로그, 뉴스 등 X)


# Print the robots.txt response of several sites.
# Each entry is (label, URL, maximum number of characters to print);
# None means the whole body is printed.
robots_targets = (
    ('프로그래머스', "https://www.programmers.co.kr/robots.txt", None),
    ('백준', "https://acmicpc.net/robots.txt", None),
    ('솔브닥', "https://solved.ac/robots.txt", 100),
)

for label, url, max_chars in robots_targets:
    print(label)
    # Explicit timeout so one unresponsive site cannot stall the whole cell;
    # requests applies no timeout by default.
    res = requests.get(url, timeout=10)
    # Slicing with None as the upper bound keeps the full text.
    print(res.text[:max_chars])

프로그래머스
User-Agent: *

Disallow: /users
Disallow: /managers
Disallow: /cable
Disallow: /admin
Disallow: /start_trial
Disallow: /pr/*
Allow: /

Sitemap: https://programmers.co.kr/sitemaps/sitemap.xml

백준
<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
</body>
</html>

솔브닥
<!DOCTYPE html><html lang="ko"><head><meta charSet="utf-8"/><title>solved.ac - robots.txt</title><me

Selenium의 기초 WebDriver (0)	2023.10.26
요청시 헤더의 수정과 페이지네이션(Pagination) (0)	2023.10.25
스크래핑할 요소의 타게팅 - ID & Class (0)	2023.10.25
스크래핑할 요소의 타게팅 - 태그 (0)	2023.10.25
파싱과 스크래핑의 기본, BeautifulSoup (0)	2023.10.25

터칭 데이터

터칭 데이터

requests 라이브러리 본문

requests 라이브러리

정보를 달라고 요청하기, GET¶

정보 갱신하는 것을 요청하기, POST¶

robots.txt 가져오기¶

'웹 스크래핑(Web scraping)' 카테고리의 다른 글

티스토리툴바

« 2025/05 »
일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31