## parser_custom.py
from bs4 import BeautifulSoup
def getProductInfo(li):
    # Extract the product name, price, and detail-page link from one <li> element.
    img = li.find("img")
    alt = img['alt']
    # Some items carry the price in a "_price_reload" span, others in a "num" span,
    # so fall back to the second selector when the first is missing.
    priceReload = li.find("span", {"class": "_price_reload"})
    if priceReload is None:
        priceReload = li.find("span", {"class": "num"})
    aTit = li.find("a", {"class": "link"})
    href = aTit['href']
    return {"name": alt, "price": priceReload.text.replace(",", ""), "link": href}
def parse(pageString):
    # Parse one search-result page and return a list of product dicts.
    bsObj = BeautifulSoup(pageString, "html.parser")
    ul = bsObj.find("ul", {"class": "goods_list"})
    lis = ul.findAll("li", {"class": "_itemSection"})
    products = []
    for li in lis:  # slice with lis[:1] to test a single item; item 20 looked like the problematic one
        product = getProductInfo(li)
        products.append(product)
    return products
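To sanity-check the selectors without hitting the live site, the parser can be run against a small hand-written fragment. This is only a sketch: the HTML below mirrors the class names assumed by the code above (goods_list, _itemSection, num, link), not Naver's real markup.

# test_parser.py -- minimal local check of parser_custom.parse
from parser_custom import parse

sample = """
<ul class="goods_list">
  <li class="_itemSection">
    <img alt="멀티탭 콘센트 5구">
    <span class="num">12,900</span>
    <a class="link" href="https://example.com/item/1">item</a>
  </li>
</ul>
"""

print(parse(sample))
# expected: [{'name': '멀티탭 콘센트 5구', 'price': '12900', 'link': 'https://example.com/item/1'}]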
## stage_naver_shopping_paging.py
import requests
from parser_custom import parse
import json
def crawl(productName, pageNo):
    # Request one page of Naver Shopping search results and return the raw HTML.
    url = "https://search.shopping.naver.com/search/all.nhn?query={}&pagingIndex={}&cat_id=&frm=NVSHATC".format(productName, pageNo)
    data = requests.get(url)
    print(data, url)  # log the response status and the requested URL
    return data.content
productName = "콘센트"  # search keyword ("power outlet")
totalProducts = []
for pageNo in range(1, 10 + 1):  # pages 1 through N
    pageString = crawl(productName, pageNo)
    products = parse(pageString)
    totalProducts += products

print(totalProducts)
print(len(totalProducts))

with open("./products.json", "w") as file:
    file.write(json.dumps(totalProducts))
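If the page layout changes or a request is throttled, parse() raises because the goods_list lookup returns None, which aborts the whole run. A hedged variant of the crawl loop that skips unparseable pages and pauses between requests might look like this (the one-second delay is an arbitrary choice, not something from the original code):

import time

totalProducts = []
for pageNo in range(1, 10 + 1):
    pageString = crawl(productName, pageNo)
    try:
        totalProducts += parse(pageString)
    except (AttributeError, TypeError):
        # a lookup returned None, so the page layout differed; skip this page
        print("could not parse page", pageNo)
    time.sleep(1)  # pause between requests to avoid hammering the server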
## analyze.py
import pandas as pd

df = pd.read_json("./products.json")
# print(df.count())

# Disable automatic URL conversion so long product links are not mangled or rejected by xlsxwriter.
# (Older pandas accepted options= directly; current versions pass it through engine_kwargs.)
with pd.ExcelWriter("products.xlsx", engine="xlsxwriter",
                    engine_kwargs={"options": {"strings_to_urls": False}}) as writer:
    df.to_excel(writer, sheet_name="sheet1")
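As a follow-up to the export (not part of the original scripts), the price column can be turned into numbers for a quick look at the crawled data; the name/price/link columns are the ones produced by parser_custom.py.

# quick_stats.py -- a sketch of a simple analysis over the saved JSON
import pandas as pd

df = pd.read_json("./products.json")
df["price"] = pd.to_numeric(df["price"], errors="coerce")  # non-numeric prices become NaN

print(df["price"].describe())                                # min / mean / max of the crawled prices
print(df.sort_values("price").head(10)[["name", "price"]])   # ten cheapest listings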