2022-08-13 update
크롤링 관련 글을 보면 찔끔찔끔 올려두거나 부분부분 잘라서 올려둔 것들이 많아서 막상 따라하면 오류를 뿜거나, 작성한 지 몇 년이 지나 웹사이트 구조가 바뀐 것이 반영되지 않아 실제로는 동작하지 않는 것이 더러 있다.
예전에 연습할 때 모아두었다가 잘된 것만 스크랩해 두었는데, 그중에 기억에 남는 것을 현 시점에서도 실행이 잘 되는지 다시 따라 해보자.
블로그 글(https://kwonkyo.tistory.com/529) 은 네이버 쇼핑항목을 따라해 보면서 처음으로? 성공하고선 기뻐했던 기억이 있다. 그리고 실행법도 자세히 설명하고 있어서 먼저 보아야 한다. 여기는 소스를 그대로 복사한 다음 검색어만 바꾸어 보았다. 현재도 여전히 실행이 잘 된다. 링크를 타고 들어가 제대로 익혀봅시다
여전히 잘 작동하는 소스코드, 따라서 일단 실행해 보자
예제1, 결과가.... 5개만 보이네
아래 코드에서 9~19번줄에 각 항목이 네이버쇼핑에서 선택하는 항목이다.
이 부분의 "구조"를 이해하면 응용하거나 오류를 잡아내는데 좋다.
## 네이버 쇼핑 크롤링 * request / urllib,urlopen 사용
import requests
from urllib.request import urlopen
import urllib
from bs4 import BeautifulSoup
# Search settings for example 1. scrap() reads them by position through
# the searchOption list built at the end of this section.
searchText = "다이소 실내 슬리퍼"  # keyword to search for
startpage, display = "1", "40"  # page number to fetch, results per page
byList, viewType = "total", "list"  # product-set filter, result layout
# NOTE(review): the original note called "rel" the Naver price order;
# it looks like the default ranking order instead - confirm.
sort = "rel"
timestamp = ""  # sent as an empty value
encText = urllib.parse.quote(searchText)  # percent-encoded keyword

searchOption = [
    encText,    # [0] origQuery
    startpage,  # [1] pagingIndex
    display,    # [2] pagingSize
    byList,     # [3] productSet
    encText,    # [4] query
    sort,       # [5] sort
    timestamp,  # [6] timestamp
    viewType,   # [7] viewType
]

# Layout of the search URL these values are placed into (sample keyword):
#   https://search.shopping.naver.com/search/all?
#     frm=NVSHATC
#     &origQuery=%ED%8E%B8%EB%B0%B1%EC%B0%9C%EA%B8%B0
#     &pagingIndex=1
#     &pagingSize=40
#     &productSet=total
#     &query=%ED%8E%B8%EB%B0%B1%EC%B0%9C%EA%B8%B0
#     &sort=rel
#     &timestamp=
#     &viewType=list
def _first_text(node, selector):
    """Return the text of the first element under *node* matching *selector*, or "" if none."""
    found = node.select_one(selector)
    return found.text if found is not None else ""


def scrap():
    """Fetch one Naver Shopping search page and print the products found in its HTML.

    The request URL is assembled from the module-level ``searchOption`` list,
    read by position: 0 origQuery, 1 pagingIndex, 2 pagingSize, 3 productSet,
    4 query, 5 sort, 6 timestamp, 7 viewType.

    Output, in order:
      * the URL and the number of product-title elements found,
      * a numbered list of the product link texts,
      * one "title price registration-text" line per product entry.

    Fields that are absent from an entry are printed as empty strings, and
    entries without a title link are skipped, instead of raising IndexError.
    """
    url = (
        "https://search.shopping.naver.com/search/all?&frm=NVSHATC"
        + "&origQuery=" + searchOption[0]      # search keyword (already URL-encoded)
        + "&pagingIndex=" + searchOption[1]    # page number to fetch
        + "&pagingSize=" + searchOption[2]     # results per page
        + "&productSet=" + searchOption[3]     # product set, e.g. "overseas" per the original note
        + "&query=" + searchOption[4]
        + "&sort=" + searchOption[5]           # result ordering
        # Was "×tamp=": "&times" had been turned into the "×" character,
        # which merged this parameter into the sort value.
        + "&timestamp=" + searchOption[6]
        + "&viewType=" + searchOption[7]       # "list" = list view
    )

    # Parse inside the with-block so the HTTP response is always closed.
    with urlopen(url) as response:
        bs = BeautifulSoup(response, "html.parser")

    product_list = bs.select("li[class^='basicList_item']")
    cnt = len(bs.find_all('div', class_='basicList_title__3P9Q7'))
    print(f'-url주소: {url}', f'-상품수: {cnt}', '', sep="\n")

    items = bs.find_all(attrs={"class": "basicList_link__1MaTN"})
    for number, item in enumerate(items, start=1):
        print(number, item.get_text())

    for li in product_list:
        # Visit direct child tags only; the raw child list may also hold
        # bare text nodes, which cannot be queried with CSS selectors.
        for goods in li.find_all(True, recursive=False):
            title = _first_text(goods, "a[class^='basicList_link__']")
            if not title:
                continue
            price = _first_text(goods, "span[class^='price_num__']")
            registerdate = _first_text(goods, "div[class^='basicList_etc_box__'] > span")
            print(title, price, registerdate)


if __name__ == "__main__":
    scrap()
그런데 문제는 결과가 많아야 하는데 5개만 표시된다.
실제로 검색한 페이지를 보려면 파란색 url주소를 복사해서 크롬 브라우저에 붙여넣기 해본다.
실제로는 아래 주소로 들어가면 검색되는 항목은 엄청 많다
https://search.shopping.naver.com/search/all?&frm=NVSHATC&origQuery=%EB%8B%A4%EC%9D%B4%EC%86%8C%20%EC%8B%A4%EB%82%B4%20%EC%8A%AC%EB%A6%AC%ED%8D%BC&pagingIndex=1&pagingSize=40&productSet=total&query=%EB%8B%A4%EC%9D%B4%EC%86%8C%20%EC%8B%A4%EB%82%B4%20%EC%8A%AC%EB%A6%AC%ED%8D%BC&sort=rel&timestamp=&viewType=list
예제2, 이제야 제대로 보인다
Convert curl commands to code
GitHub is matching all contributions to this project on GitHub Sponsors. Contribute Now
curlconverter.com
EveryX님 글에서의 설명을 잘 따라해 보자.
## 네이버 쇼핑 크롤링 예제2 * request / curlconverter.com 사용
import requests
import json
def scrap():
    """Query the Naver Shopping search API and print every product on the first page.

    Sends one GET request to ``/api/search/all`` with the cookies, headers
    and query parameters below (generated from a browser request with
    curlconverter.com), then prints the requested URL, the number of
    products returned and one "title price open-date" line per product.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the JSON body lacks ``shoppingResult.products`` or a
            product lacks one of the printed fields.
    """
    # NOTE(review): these are cookies captured from a logged-in browser
    # session (NID_AUT / NID_SES look like login tokens). They are kept
    # verbatim so the request is unchanged, but they should be rotated and
    # loaded from outside the source rather than committed.
    cookies = {
        'NNB': 'IRS72UA5CVSGE',
        'autocomplete': 'use',
        'AD_SHP_BID': '22',
        'nid_inf': '1664549890',
        'NID_JKL': 'oVX5xdcK9e5k2nd9c2ugMch8Emj80hDgw0XbSKVn4MM=',
        'NaverSuggestUse': 'unuse%26use',
        'nx_ssl': '2',
        'NID_AUT': 'Hmmps7vzk0b5qOfrEPlMPAfJNV5csZ3Wntjl6/2Frq/m4bvrcKg0zhDHbvkokuDS',
        '_ga': 'GA1.2.1224001448.1650726173',
        '_ga_7VKFYR6RV1': 'GS1.1.1659834690.13.0.1659834690.60',
        'ASID': '7db0ee2e000001827e28ca1800000058',
        'ncpa': '613264|l6rvnxz4|c02e828c89beb2f1bf6dc266cc77a490adf63021|s_21c5ebc27770|48d4834130422bba84845d1671ec03e82b3b0cac:1510716|l6rvp7i0|de9f40e3fecf79a7bbf72a06abdc6eac61e2dcab|s_c1ad54f15163|f339dce04935ed864c9d2500f8e901c903796663',
        'page_uid': 'hYPARdp0J1sssvlFkHdssssstdw-025477',
        'spage_uid': 'hYPARdp0J1sssvlFkHdssssstdw-025477',
        'NID_SES': 'AAABuW8YluCssMERON7C0avxrCx4me89rNSLb8/9ylgFZiLNGV1TUBflu+K9k6GTBzKdhZ+0LbMhQlUtw3uqPb2mi0r9ptpR+xtUGdhY62b1m5t7T/g7TS3U9IEqrE4lpvCpzhv7nLkb1808XtixttrNY/vzfrmVUtkKWsclu/5iJFzUm1I8ai2z8GGImmtr4NRoXrkFVlKIE/gPtf034leURsDywf941MmVMrIj8A2f8ZEgf7IXe7jgS17RyddHT7lFGfouoid+PT9aHGP/6OtNsUl2iXmIkHodZyToHLpy+oSoK3QnELRaBENRgvo31u+pxTmB6u7m6OuYwFjmDAi3WiZWD6AKIATHSWHnJ4S90rH2bnguvm8LueQSGvx//+iFHzjdObkn0RHjVFkRvoakaCPhmwdcqLKttraL/ei+ExsxH3fy6ujJTJ9gdYN+4GCoyqTupdorFgYeXQEAf6qzJkjyMl0x2ttCmk9uOOuu7knQPBBN2JE/xU4/Tlbm4WxDCuDrBWaZSht9L9vAS1KujL3qieWbryRezZXEmpep1c7WXhMR3+81AOm23uqW6/xSarCgo3fCxK4/MYs85WCFVRM=',
        'sus_val': 'Z9e1ECx58ewCN2iplpsgxIl7',
    }

    # Browser-like request headers; the cookies travel via cookies= above.
    headers = {
        'authority': 'search.shopping.naver.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en,ko;q=0.9,ko-KR;q=0.8',
        'dnt': '1',
        'logic': 'PART',
        'referer': 'https://search.shopping.naver.com/search/all?query=%EB%8B%A4%EC%9D%B4%EC%86%8C%20%EC%8B%A4%EB%82%B4%20%EC%8A%AC%EB%A6%AC%ED%8D%BC&frm=NVSHATC&prevQuery=%EB%8B%A4%EC%9D%B4%EC%86%8C%20%EC%8B%A4%EB%82%B4%20%EC%8A%AC%EB%A6%AC%ED%8D%BC',
        'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    }

    params = {
        'sort': 'rel',
        'pagingIndex': '1',
        'pagingSize': '40',
        'viewType': 'list',
        'productSet': 'total',
        'deliveryFee': '',
        'deliveryTypeValue': '',
        'frm': 'NVSHATC',
        'query': '다이소 실내 슬리퍼',
        'origQuery': '다이소 실내 슬리퍼',
        'iq': '',
        'eq': '',
        'xq': '',
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        'https://search.shopping.naver.com/api/search/all',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=10,
    )
    # Fail with the HTTP status instead of a confusing JSON decoding error.
    response.raise_for_status()

    products = response.json()['shoppingResult']['products']
    # Count the products themselves; len() of the whole payload only counts
    # its top-level keys.
    print(f'-url주소: {response.url}', f'-상품수: {len(products)}', '', sep="\n")

    for product in products:
        # Plain values: the original's trailing commas wrapped price and
        # open date in 1-tuples, so they printed as ('...',).
        print(product['productTitle'], product['price'], product['openDate'])


if __name__ == "__main__":
    scrap()
실행하면 값이 잘 나온다
예제3, 조금 더 편의성을 고려해 보자
그런데 위처럼 하면 검색어가 바뀌면 매번 curlconverter.com에 들어가서 소스코드를 복사한 다음, 그것을 내 코드에 복사해 두고 수정을 해야 한다. 조금 번거롭네...
그래서 예제1번의 검색항목과 조건을 [리스트]로 만들어 둔 것을 활용해서 약간 손을 봐봅시다.
requests.get 함수를 실행할 때 들어가는 headers / params 값을 '#검색할 단어' 아래에 만들어 둔 리스트에서 불러오도록 수정해 보았다. 그러면 매번 코드를 새로 복사하지 않고 searchText 항목의 검색어만 바꾸면 빠르게 찾을 수 있다.
## 네이버 쇼핑 크롤링 예제3 * request / curlconverter.com 사용
import json #추가1
import urllib #추가2
import requests
def scrap(searchText="다이소 실내 슬리퍼", startpage="1", display="40",
          byList="total", sort="rel", viewType="list"):
    """Query the Naver Shopping search API for *searchText* and print the products.

    Same request as example 2, but the search conditions are arguments, so a
    new search needs no edits to the cookies, headers or parameters below.
    Calling ``scrap()`` with no arguments performs the original search.

    Args:
        searchText: keyword to search for (plain text; it is encoded here).
        startpage: page number to fetch, sent as ``pagingIndex``.
        display: results per page, sent as ``pagingSize``.
        byList: product-set filter, sent as ``productSet``.
        sort: result ordering, sent as ``sort``.
        viewType: result layout, sent as ``viewType``.

    Prints the requested URL, the number of products returned and one
    "title price open-date" line per product.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the JSON body lacks ``shoppingResult.products`` or a
            product lacks one of the printed fields.
    """
    # Imported here so the function does not depend on another module having
    # loaded the urllib.parse submodule after a bare "import urllib".
    from urllib.parse import quote

    # The referer header is a literal URL, so the keyword is encoded by hand;
    # the query parameters are encoded by requests.
    encText = quote(searchText)
    referer = (
        'https://search.shopping.naver.com/search/all?query=' + encText
        + "&frm=NVSHATC&prevQuery=" + encText
    )

    # NOTE(review): these are cookies captured from a logged-in browser
    # session (NID_AUT / NID_SES look like login tokens). They are kept
    # verbatim so the request is unchanged, but they should be rotated and
    # loaded from outside the source rather than committed.
    cookies = {
        'NNB': 'IRS72UA5CVSGE',
        'autocomplete': 'use',
        'AD_SHP_BID': '22',
        'nid_inf': '1664549890',
        'NID_JKL': 'oVX5xdcK9e5k2nd9c2ugMch8Emj80hDgw0XbSKVn4MM=',
        'NaverSuggestUse': 'unuse%26use',
        'nx_ssl': '2',
        'NID_AUT': 'Hmmps7vzk0b5qOfrEPlMPAfJNV5csZ3Wntjl6/2Frq/m4bvrcKg0zhDHbvkokuDS',
        '_ga': 'GA1.2.1224001448.1650726173',
        '_ga_7VKFYR6RV1': 'GS1.1.1659834690.13.0.1659834690.60',
        'ASID': '7db0ee2e000001827e28ca1800000058',
        'ncpa': '613264|l6rvnxz4|c02e828c89beb2f1bf6dc266cc77a490adf63021|s_21c5ebc27770|48d4834130422bba84845d1671ec03e82b3b0cac:1510716|l6rvp7i0|de9f40e3fecf79a7bbf72a06abdc6eac61e2dcab|s_c1ad54f15163|f339dce04935ed864c9d2500f8e901c903796663',
        'page_uid': 'hYPARdp0J1sssvlFkHdssssstdw-025477',
        'spage_uid': 'hYPARdp0J1sssvlFkHdssssstdw-025477',
        'NID_SES': 'AAABuW8YluCssMERON7C0avxrCx4me89rNSLb8/9ylgFZiLNGV1TUBflu+K9k6GTBzKdhZ+0LbMhQlUtw3uqPb2mi0r9ptpR+xtUGdhY62b1m5t7T/g7TS3U9IEqrE4lpvCpzhv7nLkb1808XtixttrNY/vzfrmVUtkKWsclu/5iJFzUm1I8ai2z8GGImmtr4NRoXrkFVlKIE/gPtf034leURsDywf941MmVMrIj8A2f8ZEgf7IXe7jgS17RyddHT7lFGfouoid+PT9aHGP/6OtNsUl2iXmIkHodZyToHLpy+oSoK3QnELRaBENRgvo31u+pxTmB6u7m6OuYwFjmDAi3WiZWD6AKIATHSWHnJ4S90rH2bnguvm8LueQSGvx//+iFHzjdObkn0RHjVFkRvoakaCPhmwdcqLKttraL/ei+ExsxH3fy6ujJTJ9gdYN+4GCoyqTupdorFgYeXQEAf6qzJkjyMl0x2ttCmk9uOOuu7knQPBBN2JE/xU4/Tlbm4WxDCuDrBWaZSht9L9vAS1KujL3qieWbryRezZXEmpep1c7WXhMR3+81AOm23uqW6/xSarCgo3fCxK4/MYs85WCFVRM=',
        'sus_val': 'Z9e1ECx58ewCN2iplpsgxIl7',
    }

    # Browser-like request headers; the cookies travel via cookies= above.
    headers = {
        'authority': 'search.shopping.naver.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en,ko;q=0.9,ko-KR;q=0.8',
        # Fixed browser flag. The original wired this to the page number,
        # which only worked because both happened to be "1".
        'dnt': '1',
        'logic': 'PART',
        'referer': referer,
        'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
    }

    params = {
        'sort': sort,
        'pagingIndex': startpage,
        'pagingSize': display,
        'viewType': viewType,
        'productSet': byList,  # previously hard-coded, so byList was ignored
        'deliveryFee': '',
        'deliveryTypeValue': '',
        'frm': 'NVSHATC',
        'query': searchText,
        'origQuery': searchText,
        'iq': '',
        'eq': '',
        'xq': '',
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        'https://search.shopping.naver.com/api/search/all',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=10,
    )
    # Fail with the HTTP status instead of a confusing JSON decoding error.
    response.raise_for_status()

    products = response.json()['shoppingResult']['products']
    # Count the products themselves; len() of the whole payload only counts
    # its top-level keys.
    print(f'-url주소: {response.url}', f'-상품수: {len(products)}', '', sep="\n")

    for product in products:
        # Plain values: the original's trailing commas wrapped price and
        # open date in 1-tuples, so they printed as ('...',).
        print(product['productTitle'], product['price'], product['openDate'])


if __name__ == "__main__":
    scrap()
_