본문으로 바로가기

네이버쇼핑 크롤링

category CAE/Enjoy Programming 2019. 12. 27. 17:14
## parser_custom.py

from bs4 import BeautifulSoup

def getProductInfo(li):
    """Pull the name, price and link of one product out of its list item.

    Args:
        li: A parsed ``<li>`` element (anything exposing bs4's
            ``find(name, attrs)``; the tags it returns must offer
            ``get(attr, default)`` and ``text``).

    Returns:
        A dict with three string values:
        ``"name"``  - the ``alt`` text of the first ``<img>``,
        ``"price"`` - the text of the price ``<span>`` with commas removed,
        ``"link"``  - the ``href`` of the first ``<a class="link">``.
        A field whose element or attribute is absent is returned as ``""``
        instead of raising, so one malformed item cannot abort a whole crawl.
    """
    imgTag = li.find("img")
    name = imgTag.get("alt", "") if imgTag is not None else ""

    # "num" is the class the current markup uses; "_price_reload" is the
    # class an earlier revision of this parser targeted, kept as a fallback.
    priceTag = li.find("span", {"class": "num"})
    if priceTag is None:
        priceTag = li.find("span", {"class": "_price_reload"})
    price = priceTag.text.replace(",", "") if priceTag is not None else ""

    linkTag = li.find("a", {"class": "link"})
    link = linkTag.get("href", "") if linkTag is not None else ""

    return {"name": name, "price": price, "link": link}
    
    
def parse(pageString):
    """Extract every product found on one search-result page.

    Args:
        pageString: HTML of the page (``str`` or ``bytes``); it is parsed
            with BeautifulSoup's built-in ``"html.parser"``.

    Returns:
        A list with one ``getProductInfo`` result per
        ``<li class="_itemSection">`` inside ``<ul class="goods_list">``.
        The list is empty when the page contains no such ``<ul>``.
    """
    bsObj = BeautifulSoup(pageString, "html.parser")
    goodsList = bsObj.find("ul", {"class": "goods_list"})
    if goodsList is None:
        # No result list in this document; report "nothing found" rather
        # than failing on None and discarding pages already collected.
        return []

    # Debugging hint from the original author: slice the items (e.g. [:1])
    # to test with a single entry; the 20th item was suspected to be
    # problematic.
    items = goodsList.find_all("li", {"class": "_itemSection"})
    return [getProductInfo(item) for item in items]
## stage_naver_shopping_paging.py

import requests
from parser_custom import parse
import json

def crawl(productName, pageNo, timeout=10):
    """Download one page of Naver Shopping search results.

    Args:
        productName: Search keyword, sent as the ``query`` parameter.
        pageNo: Page number, sent as the ``pagingIndex`` parameter.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error; without it a stalled connection blocks forever.

    Returns:
        The raw response body (``bytes``). The HTTP status is not checked;
        the response object and final URL are printed for inspection.
    """
    url = "https://search.shopping.naver.com/search/all.nhn"
    # Let requests build the query string so that keywords containing
    # spaces, '&', '#', etc. are escaped instead of corrupting the URL.
    params = {
        "query": productName,
        "pagingIndex": pageNo,
        "cat_id": "",
        "frm": "NVSHATC",
    }
    response = requests.get(url, params=params, timeout=timeout)
    print(response, response.url)
    return response.content

# Search keyword to crawl.
productName = "콘센트"

# Collect the products of every result page into one flat list.
totalProducts = []
for pageNo in range(1, 10 + 1):  # from page 1 through page N, inclusive
    pageString = crawl(productName, pageNo)
    products = parse(pageString)
    totalProducts.extend(products)

print(totalProducts)
print(len(totalProducts))

# Write the result as JSON; the with-block guarantees the file is flushed
# and closed even if serialization fails part-way.
with open("./products.json", "w", encoding="utf-8") as file:
    json.dump(totalProducts, file)
## analyze.py

import pandas as pd

# Load the crawled products (a JSON array of name/price/link records).
df = pd.read_json("./products.json")

# Export to Excel. "strings_to_urls" is an XlsxWriter workbook option, so the
# engine is named explicitly and the option is passed through engine_kwargs
# (the old ``options=`` keyword and ``writer.save()`` no longer exist in
# pandas 2.x). Disabling it keeps product links as plain text instead of
# converting each one to a worksheet hyperlink.
with pd.ExcelWriter(
    "products.xlsx",
    engine="xlsxwriter",
    engine_kwargs={"options": {"strings_to_urls": False}},
) as writer:
    df.to_excel(writer, sheet_name="sheet1")