Enough talk: straight to the code. Run it and see for yourself!

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@Project -> File   :xiangmu -> demo01
@IDE    :PyCharm
@Author :junge
@Date   :2020/10/30 9:28
@Desc   :Scrape Amazon search results for product titles and links, save to Excel
=================================================='''
from threading import Thread
from urllib.parse import urljoin
import requests
import re
from openpyxl import Workbook
import time
import os


class Spider(Thread):
    def __init__(self, url, encoding='utf-8'):
        super().__init__()
        self.__url = url
        self.__encoding = encoding
        self.__title_re = re.compile(r'<span\sclass="a-size-medium\sa-color-base\sa-text-normal"\sdir="auto">(.*?)</span>', re.I | re.S)
        self.__link_re = re.compile(r'<a\sclass="a-link-normal\sa-text-normal"\shref="(.*?)">', re.I | re.S)
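        # NOTE: these patterns target Amazon's search-result markup at the time of
        # writing; if Amazon changes its class names, the regexes silently match nothing.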
        self.__headers = {
            'authority': 'www.amazon.com',
            'cache-control': 'max-age=0',
            'rtt': '400',
            'downlink': '1.55',
            'ect': '3g',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            # 'referer': 'https://www.amazon.com/s?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&i=aps&k=%E7%94%9F%E6%B4%BB%E7%94%A8%E5%93%81&ref=nb_sb_noss&url=search-alias%3Daps',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cookie': 'session-id=143-7914905-9412531; session-id-time=2082787201l; i18n-prefs=USD; lc-main=zh_CN; sp-cdn="L5Z9:SG"; skin=noskin; ubid-main=135-29oNxYJZpUioijRRr4xcGtmQim26cI/Q; csm-hit=tb:PMFW0BXYEHAKEVMQTE7V+s-T4HN9WF1JTVCQ0Z8ZQSH|1603882611132&t:1603882611132&adb:adblk_no'
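            # referer and cookie are left commented out above; fill in values from
            # your own browser session if Amazon starts serving captcha pages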

        }
        self.title = []
        self.all_links = []

    def run(self):
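        # Executes in the worker thread once start() is called.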
        html = self.__download()
        self.title = self.__get_title(html)
        self.all_links = self.__get_alllinks(html)

    def __download(self):
        """Fetch the search page; return the HTML text, or None on failure."""
        try:
            r = requests.get(self.__url, headers=self.__headers, timeout=5)
            r.raise_for_status()  # treat non-2xx responses as failures too
        except requests.RequestException as err:
            html = None
            print(f"download {self.__url} error: {err}")
            time.sleep(30)  # back off on failure; Amazon throttles aggressively
        else:
            r.encoding = self.__encoding
            html = r.text
        return html

    def __get_title(self, html):
        # Guard against a failed download (html is None, not a string).
        if not isinstance(html, str):
            return []
        return self.__title_re.findall(html)

    def __get_alllinks(self, html):
        if not isinstance(html, str):
            return []
        links = self.__link_re.findall(html)
        # Keep page order (a set would scramble the title/link pairing below);
        # dict.fromkeys dedupes while preserving insertion order.
        return [urljoin(self.__url, link) for link in dict.fromkeys(links)]

def xieru_excel(items):
    """Append (title, link) rows to the active worksheet."""
    for title_url in items:
        print(title_url)
        ws.append(title_url)

if __name__ == '__main__':
    newtable = 'product_titles_and_links.xlsx'
    wb = Workbook()
    ws = wb.active
    headData = ['Product Title', 'Product Link']
    ws.append(headData)
    with open('keywords.txt', mode='r', encoding='utf-8') as f:
        for kwd in f:
            kwd = kwd.strip()
            url = 'https://www.amazon.com/s?k={0}'.format(kwd)
            t = Spider(url)
            t.start()
            t.join()  # wait for this page before moving on to the next keyword
            biaotis = t.title       # titles, in page order
            lianjies = t.all_links  # links, in page order
            # zip pairs each title with its link and truncates to the shorter list
            xieru_excel(zip(biaotis, lianjies))
    # Save the workbook once all keywords have been processed.
    wb.save(newtable)
    os.system('pause')  # keep the console window open on Windows
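
The script reads its search terms from a keywords.txt file in the same directory, one term per line. A minimal example (these keywords are just placeholders):

shampoo
coffee maker
desk lamp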


If no rows come back, try switching your local IP: Amazon has likely flagged the scraper (anti-bot measures).
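
One way to work around a blocked IP is to route requests through a proxy. A minimal sketch, assuming you have a proxy of your own (the address below is a placeholder, not a working endpoint):

import requests

PROXY = 'http://127.0.0.1:8888'  # placeholder; substitute a proxy you control
proxies = {'http': PROXY, 'https': PROXY}
# requests routes both schemes through the proxy given in the proxies mapping
r = requests.get('https://www.amazon.com/s?k=shampoo', proxies=proxies, timeout=5)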
