#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@Project -> File   :xiangmu -> 爬取网页 (web page crawler)
@IDE    :PyCharm
@Author :junge
@Date   :2020/9/26 14:05
@Desc   :Crawl plant articles from www.tvix.cn and save each one as a .txt file
=================================================='''
import requests
import re
from urllib.parse import urljoin
import os

curdir = os.path.dirname(os.path.abspath(__file__))
os.chdir(curdir)
invalid_char = re.compile(r'[\\/:*?<>|"\n()-]')  # characters stripped from titles so they are safe as file names

def download(url):
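    """Fetch a page with browser-like headers and return its HTML decoded as UTF-8."""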
    headers = {
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      'Accept-Language': 'en,en-US;q=0.9,zh-CN;q=0.8,zh;q=0.7,und;q=0.6',
    }

    # Plain GET with a timeout so the crawler cannot hang on a dead connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'  # the site serves UTF-8; set it explicitly
    return response.text


def extract_url(base, source):
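    """Return the absolute URLs of the /zhiwu/ detail pages linked from a list page."""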
    urls = re.findall(r'href="(/zhiwu/\d+\.html)"', source, re.I)
    return {urljoin(base, url) for url in urls}


def html_one(source):
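    """Extract the article title and body from a detail page and save the text to <title>.txt."""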
    folder_path = curdir  # articles are written next to this script
    titles = re.findall(r'<h3>([^a-z]+?)</h3>', source, re.I)  # headings with no ASCII letters, i.e. the Chinese titles
    jx = re.compile(r'<div class="detail_article">(.*?)</div>', re.I | re.S)
    contents = jx.findall(source)
    # Pair each title with its article body; a detail page normally contains one of each.
    for title, content in zip(titles, contents):
        title = invalid_char.sub('', title)
        file_path = os.path.join(folder_path, f'{title}.txt')
        with open(file_path, mode='w', encoding='utf-8') as t:
            t.write(content)
            print('Saved article:', title)

if __name__ == '__main__':
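    # Crawl list pages 1-9 of the /caoben/ category and save every linked plant article.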
    for i in range(1,10):
        link = "http://www.tvix.cn/caoben/list{}.html".format(i)
        # abs = urljoin(link,'/zhiwu/2019036245.html')
        html = download(link)
        # print(html)
        links = extract_url(link,html)
        for link_url in links:
            one_html = download(link_url)
            html_one(one_html)
