前言

录入商品数据太痛苦了,但是又不能不录,所以这时候需要写一个python脚本来完成,仅供参考。

开始

可能会给一个exec表格给你,或者一个文件夹条码图片给你.....要你根据条码(商品名)找到对应的图片,保存起来。

项目代码地址

1.通过百度搜索爬取商品图片

# 通过配合商品名通过百度找图片
def getBaiDu(shop_id, search_title):
    baidu_url ="http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1460997499750_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word={}".format(search_title)
    result = requests.get(baidu_url, headers=headers)
    dowmloadPic(result.text, shop_id)

def dowmloadPic(html, shop_id):
    # 爬取多少张
    num_download = 5
    # 新建目录
    mkdir(path + '\\' + shop_id)
    for addr in re.findall('"objURL":"(.*?)"', html, re.S):
        if num_download < 0:
            break
        logger.info('现在正在爬取URL中的地址:' + str(addr))
        try:
            pic = requests.get(addr, timeout=10, headers=headers)
        except requests.exceptions.ConnectionError:
            logger.info('您当前的URL出现错误!')
            continue
        localtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
        fn = open(path + '\\' + shop_id + '\\' + str(localtime) +'.png','wb')
        fn.write(pic.content)
        fn.close()

        # drop_wartermark(path + '\\' + shop_id + '\\' + str(localtime) +'.png', path + '\\' + shop_id + '\\' + str(localtime) +'-0.png')

        num_download = num_download - 1
        logger.info(path + '\\' + shop_id + '\\' + str(localtime) +'.png')

2.根据(tiaoma.cnaidc.com)网站搜索商品信息

# 爬取 "tiaoma.cnaidc.com" 来查找商品信息
def requestT1(shop_id):
    url = 'http://tiaoma.cnaidc.com'
    s = requests.session()

    # 获取验证码
    img_data  = s.get(url + '/index/verify.html?time=',  headers=headers).content
    with open('verification_code.png','wb') as v:
        v.write(img_data)

    # 解验证码
    ocr = ddddocr.DdddOcr()
    with open('verification_code.png', 'rb') as f:
        img_bytes = f.read()
    code = ocr.classification(img_bytes)
    logger.info('当前验证码为 ' + code)
    # 请求接口参数
    data = {"code": shop_id, "verify": code}
    resp = s.post(url + '/index/search.html',headers=headers,data=data)
    resp_json = parse_json(resp.text)
    logger.info(resp_json)
    # 判断是否查询成功
    if resp_json['msg'] == '查询成功' and resp_json['json'].get('code_img'):
        # 保存商品图片
        img_url = ''
        if resp_json['json']['code_img'].find('http') == -1:
            img_url =  url + resp_json['json']['code_img']
        else:
            img_url =  resp_json['json']['code_img']

        try:
            shop_img_data  = s.get(img_url,  headers=headers, timeout=10,).content
             # 新建目录
            mkdir(path + '\\' + shop_id)
            localtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
            # 保存图片
            with open(path + '\\' + shop_id + '\\' + str(localtime) +'.png','wb') as v:
                v.write(shop_img_data)
            logger.info(path + '\\' + shop_id + '\\' + str(localtime) +'.png')
        except requests.exceptions.ConnectionError:
            logger.info('访问图片URL出现错误!') 
       
    if resp_json['msg'] == '验证码错误':
        requestT1(shop_id)

3.根据 中国物品编码 搜索商品信息

# 中国物品编码
def requestT2(shop_id):
    s = requests.session()
    t2_url = 'http://search.anccnet.com/searchResult2.aspx?keyword='+ shop_id
    headers['Cookie'] = 'ASP.NET_SessionId=blgmvuf5s54mtz45si25rga2' # 需要手动获取
    headers['Host'] = 'search.anccnet.com'
    headers['Referer'] = 'http://search.anccnet.com/searchResult2.aspx'
    resp = s.get(t2_url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    results = soup.find(attrs={"id":"results"})
    if results:
        for div_tag in results.find_all('div', {'class': 'result'}):
            p_info = div_tag.find('dl', {'class': 'p-info'})
            dd = p_info.find_all('dd')
            shop_name = dd[3].text
            a = dd[0].find('a')
            # 获取商品详情的大图
            shop_resp = s.get(a['href'], headers=headers)
            shop_soup = BeautifulSoup(shop_resp.text, 'lxml')
            results_img = shop_soup.find(attrs={"id":"imageListDiv"})
            if results_img:
                # 新建目录
                mkdir(path + '\\' + shop_id)
                for img_tag in results_img.find_all('img'):
                    try:
                        pic = requests.get(img_tag['src'], timeout=10, headers=headers)
                    except requests.exceptions.ConnectionError:
                        logger.info('访问商品图片出现错误!')
                        continue
                    localtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
                    fn = open(path + '\\' + shop_id + '\\' + str(localtime) +'.png','wb')
                    fn.write(pic.content)
                    fn.close()
                    logger.info(path + '\\' + shop_id + '\\' + str(localtime) +'.png')
            else:
                logger.info('条码:{0} 商品名:{1} 当前抓取商品无图片!'.format(shop_id, shop_name))
    else:
        logger.info('当前访问过快啦~中国物品编码拒绝了我们的访问, 请等待30秒在访问')
        time.sleep(30)
        requestT2(shop_id)

Logo

腾讯云面向开发者汇聚海量精品云计算使用和开发经验,营造开放的云计算技术生态圈。

更多推荐