from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the
# last one (this is why several outputs appear after a single cell below).
InteractiveShell.ast_node_interactivity = "all"


# User-Agent header value that identifies the client as desktop Chrome on macOS.
user_agent = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.97 Safari/537.36"
    ),
}


# 필요한 라이브러리를 불러온 후, 요청을 진행해봅시다.
# 응답을 바탕으로 BeautifulSoup 객체를 생성해봅시다.
# 질문 태그의 빈도를 체크하는 dict를 만든 후, 빈도를 체크해봅시다.

import requests
import time
from bs4 import BeautifulSoup

# Maps each tag name to the number of times it appears on the scraped pages.
frequency = {}

# Walk through listing pages 1..10.
for i in range(1, 11):
    # The second positional parameter of requests.get() is `params` (the
    # query string), so the User-Agent dict must be passed as `headers=`
    # for it to be sent as an HTTP header. A timeout keeps a stalled
    # connection from hanging the loop forever.
    res = requests.get(
        f"https://hashcode.co.kr/?page={i}",
        headers=user_agent,
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of counting tags from an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # 1. Find every <ul> tag with the "question-tags" class.
    # 2. Extract the text of each <li> tag inside those lists.

    ul_tags = soup.find_all("ul", class_="question-tags")
    for ul in ul_tags:
        li_tags = ul.find_all("li")
        for li in li_tags:
            tag_name = li.text.strip()
            frequency[tag_name] = frequency.get(tag_name, 0) + 1

    # Pause between page requests.
    time.sleep(0.5)

frequency

{'python': 199,
 'device': 1,
 'mssql': 2,
 'migration': 1,
 'c#': 6,
 'c++': 25,
 'visualstudio': 3,
 'deploy': 1,
 'nginx': 1,
 'stack': 2,
 'token': 1,
 'iframe': 1,
 'react': 7,
 'javascript': 20,
 'css': 7,
 'border': 1,
 'html': 13,
 'selector': 1,
 'div': 1,
 'vscode': 5,
 'flutter': 1,
 'android': 5,
 'code': 3,
 'xcode': 2,
 'heapsort': 2,
 'c': 48,
 'operator': 1,
 'django': 5,
 'serializable': 1,
 'java': 39,
 'node.js': 7,
 'mysql': 3,
 'turtle': 2,
 'sorting': 2,
 'mongodb': 1,
 'directx': 1,
 'pandas': 23,
 'dataframe': 11,
 'excel': 5,
 'scanf': 3,
 'ipc': 1,
 'openai': 1,
 'cv2': 2,
 'gui': 4,
 'windows': 4,
 'opencv': 7,
 'interface': 1,
 'class': 8,
 'inheritance': 2,
 'back-end': 1,
 'front-end': 1,
 'spring-boot': 1,
 'web': 1,
 'security': 1,
 'firewall': 1,
 'selenium': 5,
 'beautifulsoup': 7,
 'urllib': 1,
 'unity': 1,
 'scraping': 2,
 'requests': 1,
 'spring': 4,
 'floatig-point': 1,
 'editor': 1,
 'mfc': 2,
 'pointer': 1,
 'array': 3,
 'visual-studio-2010': 1,
 'http': 1,
 'socket-programming': 1,
 'coding-test': 10,
 'numpy': 4,
 'algorithm': 6,
 'tkinter': 4,
 'import': 3,
 'database': 2,
 'crawling': 8,
 'naver': 1,
 'axios': 3,
 's3': 1,
 'cloudfront': 1,
 'boot': 1,
 'exception': 1,
 'error': 9,
 'querying': 1,
 'save': 1,
 'git': 2,
 'bat': 1,
 'batch': 1,
 'python3': 9,
 'bfs': 2,
 'statsmodels': 1,
 'file': 2,
 'txt': 1,
 'stackoverflow': 1,
 'ubuntu': 4,
 'gcc': 1,
 'intellij-idea': 2,
 'global-variable': 1,
 'pyinstaller': 3,
 'next.js': 1,
 'cookie': 1,
 'arduino': 3,
 'data-structure': 3,
 'mecab': 1,
 'parameter': 2,
 'linux': 2,
 'web-crawling': 2,
 'system': 2,
 'vim': 1,
 'struct': 2,
 'generic': 1,
 'deep-learning': 1,
 'keyerror': 2,
 'matplotlib': 1,
 'replace': 1,
 'ai': 2,
 'software_development': 1,
 'javac': 1,
 'vmware': 1,
 'multithreading': 1,
 'regex': 3,
 'multiprocessing': 1,
 'pygame': 1,
 'application-development': 1,
 'logistic-regression': 1,
 'logistic': 1,
 'csv': 2,
 'instance': 1,
 'hashmap': 1,
 'object': 1,
 'for': 5,
 'selenium-webdrive': 5,
 'win32': 1,
 'recursive': 3,
 'flatten': 1,
 'dictionary': 3,
 'initialization': 2,
 'qt': 1,
 'pdf': 1,
 'hashcode': 1,
 'kotlin': 4,
 'private': 1,
 'image': 3,
 'netlify': 1,
 'data': 2,
 'directory': 1,
 'raspberry-pi': 2,
 'dom': 1,
 'asynchronous': 1,
 'jquery': 2,
 'vb.net': 1,
 'post': 1,
 'webrequest': 1,
 'html5': 2,
 'accordion': 1,
 'user-interface': 1,
 'openpyxl': 3,
 'function': 4,
 'sql': 1,
 'dijkstra': 1,
 'binary-search-tree': 1,
 'index': 1,
 'pip': 2,
 'install': 1,
 'terminal': 1,
 'alias': 1,
 'map': 1,
 'linked-list': 2,
 'switch문': 1,
 'coding': 2,
 '.net': 1,
 'vector': 1,
 'ejs': 1,
 'if문': 1,
 'print': 2,
 'prettier': 1,
 'library': 2,
 'linux-kernel': 1,
 'recursion': 2,
 'game': 3,
 'min': 1,
 'max': 1,
 'thread': 1,
 'javafx': 1,
 'foreach': 2,
 'unix': 1,
 'logging': 1,
 'keras': 1,
 'winform': 2,
 'wpf': 1,
 'testing': 1,
 'filter': 1,
 'stringbuilder': 1,
 'c++표준': 1,
 'method': 2,
 'cdn': 1,
 'decryption': 1,
 'postfix': 1,
 'tree': 1,
 'duplicate': 1,
 'ssh': 1,
 'append': 1,
 'decorator': 1,
 'string': 4,
 'ide': 1,
 'anaconda': 1,
 'import-에러': 1,
 'webdriver': 1,
 '403clienterror': 1,
 'collections': 1,
 'join': 1,
 'assembly': 1,
 'random': 2,
 'programming': 2,
 'input': 1,
 'export': 1,
 'visual-studio': 3,
 'while-loop': 2,
 'ajax': 1,
 'crypto': 1,
 'init': 1,
 'def': 2,
 'byte': 1,
 'utf-8': 1,
 'syntax-error': 1,
 'eclipse': 2,
 'preference': 1,
 'plugin': 1,
 'int': 1,
 'rstudio': 1,
 'r': 1,
 'formatting': 1,
 'pycrypto': 1,
 'word2vec': 2,
 'list': 4,
 'output': 1,
 'nlp': 1,
 'json': 1,
 'tensorflow': 1,
 'header': 1,
 'io': 1,
 'restframework': 1,
 'encryption': 1,
 'postgresql': 1,
 'android-studio': 2,
 'polynomial': 1,
 'network': 1,
 'port': 1,
 'name': 1,
 'fortran': 1,
 'regexp': 1,
 'service': 1,
 'gps': 1,
 'loops': 1,
 'scala': 1,
 'python-3.x': 1,
 'continue': 1,
 'this': 1,
 'mac': 1,
 'github': 3,
 'keyboard-shortcut': 1,
 'typescript': 1,
 'npm': 1,
 'ios': 1,
 'webview': 1,
 'webapp': 1,
 'ruby-on-rails': 2,
 'float': 1,
 'return': 1,
 'tuple': 1,
 'py': 1,
 'conditional-statement': 1,
 'vue': 1,
 'get': 1,
 'session': 1,
 'psycopg2': 1,
 'table': 1,
 'socket': 1,
 'asyncio': 1,
 'ruby': 1}


# Counter를 사용해 가장 빈도가 높은 value들을 추출합니다.

from collections import Counter

# Load the tag -> count mapping into a Counter so it can be ranked by count.
counter = Counter()
counter.update(frequency)

# The ten most frequent tags, highest count first.
counter.most_common(n=10)

[('python', 199),
 ('c', 48),
 ('java', 39),
 ('c++', 25),
 ('pandas', 23),
 ('javascript', 20),
 ('html', 13),
 ('dataframe', 11),
 ('coding-test', 10),
 ('error', 9)]


# Seaborn을 이용해 이를 Barplot으로 그립니다.

import seaborn as sns

# Split the ten (tag, count) pairs into two parallel lists, one per plot axis.
x = [tag for tag, _ in counter.most_common(10)]
y = [count for _, count in counter.most_common(10)]
print(x)
print(y)

# Bar chart with the tag names on the x-axis and their counts on the y-axis.
sns.barplot(x=x, y=y)

['python', 'c', 'java', 'c++', 'pandas', 'javascript', 'html', 'dataframe', 'coding-test', 'error']
[199, 48, 39, 25, 23, 20, 13, 11, 10, 9]

<Axes: >


# figure, xlabel, ylabel, title을 적절하게 설정해서 시각화를 완성해봅시다.

import matplotlib.pyplot as plt

# Open a new figure with figsize (20, 10) for the chart drawn below.
plt.figure(figsize=(20, 10))
# Chart title and axis labels.
plt.title("Frequency of question in Hashcode")
plt.xlabel("Tag")
plt.ylabel("Frequency")

# Draw the bar chart from the x (tags) and y (counts) lists built earlier in
# the file, then render the figure.
sns.barplot(x = x, y = y)
plt.show()

<Figure size 2000x1000 with 0 Axes>

Text(0.5, 1.0, 'Frequency of question in Hashcode')

Text(0.5, 0, 'Tag')

Text(0, 0.5, 'Frequency')

<Axes: title={'center': 'Frequency of question in Hashcode'}, xlabel='Tag', ylabel='Frequency'>

스크래핑 결과 시각화 - Hashcode

4-2. 스크래핑 결과 시각화하기 I - 해시코드 질문태그 빈도 시각화

Target: 해시코드 질문 태그의 빈도 확인