Using Python to crawl a site's XML sitemap and batch-save its pages as HTML files

Scenario

The main server for this site sits in my dorm room, and every night (except holidays) the dorm supervisor manually pulls the breaker, cutting both power and network. If the site is unreachable at night, search engine crawlers cannot get to it either. My solution is to save the pages as static HTML files and upload them to the backup servers (the frp node servers). The mirrored site is pure HTML, so there is no interactivity: you can read it, but nothing dynamic works.

Code

I used Python for this. The script is not deployed to run automatically on a server yet; for now it just collects the links from the site's XML sitemap and batch-saves the HTML pages into a specific folder, and a sync tool then pushes that folder to all the frp node servers.
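If you later want it to re-run on a schedule, here is a minimal sketch of one way to do that. This is my own suggestion, not part of the original workflow, and it assumes the script below is saved as crawl_site.py so its main() can be imported:

import time

from crawl_site import main  # hypothetical module name for the script below

INTERVAL_SECONDS = 24 * 60 * 60  # roughly once a day

if __name__ == "__main__":
    while True:
        try:
            main()                    # crawl the sitemap and save everything
        except Exception as exc:      # keep the loop alive even if one run fails
            print(f"Run failed: {exc}")
        time.sleep(INTERVAL_SECONDS)

A cron job or systemd timer would do the same thing more cleanly, but the loop above needs nothing beyond the script itself.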

There is one small bug: the XML files themselves are not downloaded, which means search engines still cannot reach the sitemap files. I may fix that later; I fiddled with it for a long time without getting the download to work (all of the code is AI-generated, I cannot write it myself).
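For reference, here is a rough idea of what that fix could look like: a standalone, hypothetical sketch that saves sitemap.xml and any sub-sitemaps it lists under their original filenames. The placeholder URL is the same one used in the main script; this is just one possible shape of the fix, not the final version.

import os
import requests
from bs4 import BeautifulSoup

SITEMAP_URL = 'https://你的站点url/sitemap.xml'  # placeholder, replace with your own domain
OUTPUT_DIR = 'xml_files'

def save_xml(url, out_dir):
    """Download one XML file and keep its original filename."""
    response = requests.get(url, headers={'User-Agent': 'WNLUOTESTURL'}, timeout=10)
    response.raise_for_status()
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, url.rstrip('/').split('/')[-1])
    with open(path, 'wb') as fh:
        fh.write(response.content)
    return response.content

def download_all_sitemaps():
    index_content = save_xml(SITEMAP_URL, OUTPUT_DIR)
    soup = BeautifulSoup(index_content, 'xml')  # the 'xml' parser needs the lxml package
    for loc in soup.find_all('loc'):
        if loc.text.endswith(('.xml', '.xml.gz')):  # sub-sitemaps only
            save_xml(loc.text, OUTPUT_DIR)

if __name__ == '__main__':
    download_all_sitemaps()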

Python code

import hashlib
import logging
import os
import re
import requests
import urllib.parse
from bs4 import BeautifulSoup, Tag
from requests.exceptions import RequestException

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def is_xml_link(url):
    """判断是否为XML链接"""
    return url.endswith(('.xml', '.xml.gz'))

def safe_download_resource(resource_url, local_path, session, timeout=10):
    """Download a resource to a local path, retrying up to three times with a timeout."""
    for attempt in range(3):
        try:
            response = session.get(resource_url, stream=True, timeout=timeout)
            if response.status_code == 200:
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                with open(local_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
                logging.info(f"Downloaded {resource_url} to {local_path}")
                return True
            logging.warning(f"Failed to download {resource_url}, status code: {response.status_code}")
        except RequestException as e:
            if attempt < 2:
                logging.info(f"Download error occurred, retrying {attempt + 1}/3: {e}")
                continue
            logging.warning(f"Failed to download after retries: {e}")
    # All attempts failed (network error or non-200 response)
    return False

def process_html(html_content, base_url, output_folder, session):
    """Parse HTML, download same-site resources, and rewrite their links to local paths."""
    soup = BeautifulSoup(html_content, 'html.parser')
    resource_types = ['.jpg', '.png', '.webp', '.gif', '.svg', '.css', '.js']

    for tag in soup.find_all(['img', 'link', 'script']):
        if tag.name == 'img':
            attr_name = 'src'
        elif tag.name == 'link' and tag.get('rel') == ['stylesheet']:
            attr_name = 'href'
        elif tag.name == 'script':
            attr_name = 'src'
        else:
            continue

        src = tag.get(attr_name)
        if src and not src.startswith(('http:', 'https:')):
            # Resolve relative URLs against the page's base URL
            src = urllib.parse.urljoin(base_url, src)

        # Only mirror resources that live under this site's base URL
        if src and src.startswith(base_url) and any(src.endswith(ext) for ext in resource_types):
            local_path = os.path.join(output_folder, src[len(base_url):])
            if not safe_download_resource(src, local_path, session):
                logging.info(f"Skipped broken resource: {src}")
            else:
                # Point the tag at the locally saved copy
                tag[attr_name] = os.path.relpath(local_path, output_folder)

    return str(soup)

def check_for_updates_and_download(link, local_path, session, timeout=10):
    """Check whether the remote file differs from the local copy and download it if so."""
    if os.path.exists(local_path):
        with open(local_path, 'rb') as file:
            local_md5 = hashlib.md5(file.read()).hexdigest()
    else:
        local_md5 = None

    remote_md5 = None
    try:
        response = session.head(link, allow_redirects=True, timeout=timeout)
        if response.status_code == 200:
            remote_etag = response.headers.get('ETag')
            if remote_etag:
                # Use the ETag as a cheap change indicator (it only matches local_md5 when
                # the server happens to use MD5 ETags; otherwise we simply re-download)
                remote_md5 = remote_etag.strip('"')
            else:
                # No ETag: fetch the body and compute its MD5
                response = session.get(link, stream=True, timeout=timeout)
                if response.status_code == 200:
                    remote_md5 = hashlib.md5(response.content).hexdigest()
    except Exception as e:
        logging.info(f"Error checking updates for {link}: {e}")
        return False

    if remote_md5 != local_md5:
        # Content changed (or there is no local copy yet): download it
        if safe_download_resource(link, local_path, session, timeout):
            logging.info(f"Updated content downloaded from {link} to {local_path}")
            return True
        return False
    logging.info(f"No update needed, skipping download of {link}")
    return False


def download_and_save_html_with_resources_updated(link, output_folder='html_files', filename=None):
    """Download an HTML page (with its resources) or an XML file, with an update check."""
    headers = {'User-Agent': 'WNLUOTESTURL'}
    try:
        response = requests.get(link, headers=headers)
        if response.status_code != 200:
            logging.warning(f"Failed to fetch {link}, status code: {response.status_code}")
            return

        content_type = response.headers.get('Content-Type', '').lower()
        base_url = link.rsplit('/', 1)[0] + '/'  # base URL, used by both branches below

        # Reuse one session for all resource downloads and update checks
        with requests.Session() as session:
            if 'text/html' in content_type:
                # Process the HTML: download resources and rewrite their links
                processed_html = process_html(response.text, base_url, output_folder, session)

                if filename:
                    save_path = os.path.join(output_folder, filename)
                else:
                    save_path = os.path.join(output_folder, os.path.basename(link))

                # Save the page only if it changed since the last run
                check_for_updates_and_download(link, save_path, session)

                # The home page is always overwritten with the processed version
                if link == 'https://你的站点url':
                    with open(save_path, 'w', encoding='utf-8') as file:
                        file.write(processed_html)
                    logging.info(f"Saved {link} as {save_path} with resources")

            elif 'application/xml' in content_type or link.endswith(('.xml', '.xml.gz')):
                # XML (sitemap) download
                if filename:
                    save_path = os.path.join(output_folder, filename)
                else:
                    # Keep the path structure relative to the base URL where possible
                    relative_path = link.replace(base_url, '', 1) if link.startswith(base_url) else link.split('/')[-1]
                    save_path = os.path.join(output_folder, relative_path)

                if check_for_updates_and_download(link, save_path, session):
                    logging.info(f"XML file downloaded from {link} to {save_path}")

            else:
                logging.warning(f"Unsupported content type for {link}: {content_type}")
    except Exception as e:
        logging.error(f"Error downloading {link} or processing resources: {e}")




def extract_links_from_xml(xml_url, depth=0, max_depth=5):
    """Recursively extract page links from a sitemap XML file (follows nested sitemaps)."""
    headers = {'User-Agent': 'WNLUOTESTURL'}
    if depth > max_depth:
        logging.warning("Reached maximum recursion depth, stopping.")
        return []
    links = []
    try:
        response = requests.get(xml_url, headers=headers)
        if response.status_code == 200:
            # The 'xml' parser requires the lxml package
            soup = BeautifulSoup(response.content, 'xml')
            for link in soup.find_all(['loc']):
                href = link.text
                if is_xml_link(href):
                    # Nested sitemap: recurse into it
                    links.extend(extract_links_from_xml(href, depth + 1, max_depth))
                elif href.endswith(('.html', '.htm')):
                    links.append(href)
            return links
        logging.warning(f"Failed to fetch XML at {xml_url}, status code: {response.status_code}")
    except Exception as e:
        logging.warning(f"Error fetching XML: {e}")
    return links




def main():
    start_xml_url = 'https://你的站点url/sitemap.xml'
    all_links = extract_links_from_xml(start_xml_url)

    home_page_url = 'https://你的站点url'
    all_links.append((home_page_url, 'index.html'))  # tuple of (link, desired filename) for the home page

    for link in all_links:
        if isinstance(link, tuple):  # special case: the home page gets a fixed filename
            link_url, filename = link
            download_and_save_html_with_resources_updated(link_url, output_folder='html_files', filename=filename)
        elif link.endswith(('.html', '.htm')):
            download_and_save_html_with_resources_updated(link)

    # Download the sitemap.xml index itself
    sitemap_local_path = 'sitemap.xml'
    download_and_save_html_with_resources_updated(start_xml_url, output_folder='.', filename=sitemap_local_path)

    # Try to pick up further XML links from the sitemap. Note: extract_links_from_xml
    # only returns HTML page links, so nested sitemap files are still not downloaded
    # here (the limitation mentioned earlier in the post).
    all_xml_links = extract_links_from_xml(start_xml_url)

    for xml_link in all_xml_links:
        # Only handle XML links and avoid re-downloading sitemap.xml itself
        if is_xml_link(xml_link) and xml_link != start_xml_url:
            download_and_save_html_with_resources_updated(xml_link, output_folder='xml_files')

if __name__ == "__main__":
    main()

Since the code is AI-generated, some steps may be redundant. Remember to replace "你的站点url" with your own domain. You will also need the requests, beautifulsoup4 and lxml packages installed (BeautifulSoup's 'xml' parser relies on lxml).

I have tested it myself and it works; I am already using it in my real environment.

Once the run finishes, you will find the saved files in the output folder.

For syncing I use Resilio Sync (威力同步); once it is configured, you no longer have to upload the files to each server one by one.

Just let the sync software take care of it.

After each crawl, everything is usually synced over within a few minutes.
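If you would rather script this step than run a sync client, a rough alternative (my own sketch, not what this setup uses) is to push the output folders to each frp node with rsync over SSH. The node addresses and target path below are placeholders and assume key-based SSH login.

import subprocess

NODES = ['user@frp-node-1', 'user@frp-node-2']  # placeholder node addresses
FOLDERS = ['html_files/', 'xml_files/']          # trailing slash: sync the contents

for node in NODES:
    for folder in FOLDERS:
        # -a keeps permissions and timestamps, -z compresses, --delete removes stale files
        subprocess.run(
            ['rsync', '-az', '--delete', folder, f'{node}:/var/www/static/{folder}'],
            check=True,
        )

Either way, every node ends up with the same static copy of the site.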
