Using Python to crawl a site's sitemap XML and batch-save its pages as HTML files
Scenario
The main server for this site sits in my dorm room, and every night (except on holidays) the dorm supervisor manually cuts the power, which takes down both the machine and the network. If the site is unreachable overnight, search-engine crawlers can't visit it either. My workaround is to save the pages as static HTML files and upload them to a backup server (an frp node server). That mirror is pure HTML, so there is no interactivity: you can read everything, but nothing dynamic works.
Code
I wrote this in Python. It isn't deployed on a server to run automatically yet; for now it just collects links from the site's sitemap XML, batch-saves the HTML pages into a designated folder, and a sync tool then pushes that folder to all of the frp node servers. (The script only depends on requests and beautifulsoup4, plus lxml for parsing the sitemaps.)
One known weak spot is the sitemap XML itself: I never managed to get the XML files downloading reliably, which would leave search engines unable to reach the sitemap on the mirror. I may sort that out properly later; I fiddled with it for ages without luck (the code is entirely AI-written, I can't write it myself).
Python code
import hashlib
import logging
import os
import urllib.parse

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def is_xml_link(url):
    """Return True if the URL points to an XML (or gzipped XML) file."""
    return url.endswith(('.xml', '.xml.gz'))

def safe_download_resource(resource_url, local_path, session, timeout=10):
    """Download a resource to local_path, retrying up to 3 times with a timeout."""
    for attempt in range(3):
        try:
            response = session.get(resource_url, stream=True, timeout=timeout)
            if response.status_code == 200:
                os.makedirs(os.path.dirname(local_path) or '.', exist_ok=True)
                with open(local_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
                logging.info(f"Downloaded {resource_url} to {local_path}")
                return True
            logging.info(f"Failed to download {resource_url}, status code: {response.status_code}")
        except RequestException as e:
            if attempt < 2:
                logging.info(f"Download error occurred, retrying {attempt + 1}/3: {e}")
                continue
            logging.info(f"Failed to download after retries: {e}")
    return False
def process_html(html_content, base_url, output_folder, session):
    """Download a page's images/CSS/JS and rewrite their links to local relative paths."""
    soup = BeautifulSoup(html_content, 'html.parser')
    resource_types = ('.jpg', '.png', '.webp', '.gif', '.svg', '.css', '.js')
    for tag in soup.find_all(['img', 'link', 'script']):
        if tag.name == 'img':
            attr_name = 'src'
        elif tag.name == 'link' and tag.get('rel') == ['stylesheet']:
            attr_name = 'href'
        elif tag.name == 'script':
            attr_name = 'src'
        else:
            continue
        src = tag.get(attr_name)
        if src and not src.startswith(('http:', 'https:')):
            # Resolve relative URLs against the page's base URL
            src = urllib.parse.urljoin(base_url, src)
        if src and src.startswith(base_url) and any(src.endswith(ext) for ext in resource_types):
            # Mirror the resource under output_folder, keeping its path relative to base_url
            local_path = os.path.join(output_folder, src[len(base_url):])
            if safe_download_resource(src, local_path, session):
                # Point the tag at the locally saved copy
                tag[attr_name] = os.path.relpath(local_path, output_folder)
            else:
                logging.info(f"Skipped broken resource: {src}")
    return str(soup)
def check_for_updates_and_download(link, local_path, session, timeout=10):
    """Download the file at `link` only if the remote content differs from the local copy."""
    if os.path.exists(local_path):
        with open(local_path, 'rb') as file:
            local_md5 = hashlib.md5(file.read()).hexdigest()
    else:
        local_md5 = None
    remote_md5 = None
    try:
        response = session.head(link, allow_redirects=True, timeout=timeout)
        if response.status_code == 200:
            remote_etag = response.headers.get('ETag')
            if remote_etag:
                # Use the ETag as a cheap change indicator (it is not always an MD5)
                remote_md5 = remote_etag.strip('"')
            else:
                # No ETag: fetch the body and hash it ourselves
                response = session.get(link, timeout=timeout)
                if response.status_code == 200:
                    remote_md5 = hashlib.md5(response.content).hexdigest()
    except Exception as e:
        logging.info(f"Error checking updates for {link}: {e}")
        return False
    if remote_md5 != local_md5:
        # Content changed (or was never downloaded): fetch it
        if safe_download_resource(link, local_path, session, timeout):
            logging.info(f"Updated content downloaded from {link} to {local_path}")
            return True
        return False
    logging.info(f"No update needed, skipping download of {link}")
    return False
def download_and_save_html_with_resources_updated(link, output_folder='html_files', filename=None):
    """Download an HTML page (plus its resources) or an XML file, with an update check."""
    headers = {'User-Agent': 'WNLUOTESTURL'}
    try:
        # Reuse one connection for the page and all of its resources
        with requests.Session() as session:
            session.headers.update(headers)
            response = session.get(link, timeout=10)
            if response.status_code != 200:
                logging.warning(f"Failed to fetch {link}, status code: {response.status_code}")
                return
            content_type = response.headers.get('Content-Type', '').lower()
            base_url = urllib.parse.urljoin(link, '.')  # directory part of the page URL
            if filename:
                save_path = os.path.join(output_folder, filename)
            else:
                save_path = os.path.join(output_folder, os.path.basename(link) or 'index.html')
            if 'text/html' in content_type:
                # Mirror the page's resources, rewrite their links, then save the result
                processed_html = process_html(response.text, base_url, output_folder, session)
                os.makedirs(output_folder, exist_ok=True)
                with open(save_path, 'w', encoding='utf-8') as file:
                    file.write(processed_html)
                logging.info(f"Saved {link} as {save_path} with resources")
            elif 'xml' in content_type or is_xml_link(link):
                # Sitemap or other XML file: download it only if it has changed
                if check_for_updates_and_download(link, save_path, session):
                    logging.info(f"XML file downloaded from {link} to {save_path}")
            else:
                logging.warning(f"Unsupported content type for {link}: {content_type}")
    except Exception as e:
        logging.error(f"Error downloading {link} or processing resources: {e}")
def extract_links_from_xml(xml_url, depth=0, max_depth=5):
    """Recursively extract page links from a sitemap XML, following nested sitemaps."""
    headers = {'User-Agent': 'WNLUOTESTURL'}
    if depth > max_depth:
        logging.warning("Reached maximum recursion depth, stopping.")
        return []
    links = []
    try:
        response = requests.get(xml_url, headers=headers, timeout=10)
        if response.status_code == 200:
            # The 'xml' parser requires the lxml package
            soup = BeautifulSoup(response.content, 'xml')
            for loc in soup.find_all('loc'):
                href = loc.text.strip()
                if is_xml_link(href):
                    # Nested sitemap: recurse into it
                    links.extend(extract_links_from_xml(href, depth + 1, max_depth))
                elif href.endswith(('.html', '.htm')):
                    links.append(href)
        else:
            logging.warning(f"Failed to fetch XML at {xml_url}, status code: {response.status_code}")
    except Exception as e:
        logging.error(f"Error fetching XML: {e}")
    return links
def main():
    start_xml_url = 'https://你的站点url/sitemap.xml'
    home_page_url = 'https://你的站点url'

    # Collect every page link referenced by the sitemap (and any nested sitemaps)
    all_links = extract_links_from_xml(start_xml_url)
    all_links.append((home_page_url, 'index.html'))  # the home page gets an explicit file name

    for link in all_links:
        if isinstance(link, tuple):  # the specially named home page
            link_url, filename = link
            download_and_save_html_with_resources_updated(link_url, output_folder='html_files', filename=filename)
        elif link.endswith(('.html', '.htm')):
            download_and_save_html_with_resources_updated(link)

    # Also mirror the sitemap itself so crawlers on the static copy can reach it
    download_and_save_html_with_resources_updated(start_xml_url, output_folder='.', filename='sitemap.xml')

    # Download any nested sitemaps referenced by the top-level sitemap
    try:
        response = requests.get(start_xml_url, headers={'User-Agent': 'WNLUOTESTURL'}, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            for loc in soup.find_all('loc'):
                href = loc.text.strip()
                if is_xml_link(href) and href != start_xml_url:
                    download_and_save_html_with_resources_updated(href, output_folder='xml_files')
    except Exception as e:
        logging.error(f"Error fetching nested sitemaps: {e}")

if __name__ == "__main__":
    main()
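Before crawling the whole sitemap, the download function can be tried on a single page first; a minimal usage sketch (using the same placeholder URL as the script):

# Mirror only the home page into html_files/ as index.html.
download_and_save_html_with_resources_updated(
    'https://你的站点url',
    output_folder='html_files',
    filename='index.html',
)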
Because the code was written by AI, some parts may be redundant. Remember to change '你的站点url' to your own site's domain.
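To avoid hunting down every occurrence of the placeholder, it could also be pulled into a pair of constants near the top of the script; a small sketch (the names SITE_URL and SITEMAP_URL are my own, not part of the original code):

# Define the site once and derive the sitemap URL from it.
SITE_URL = 'https://你的站点url'         # replace with your own domain
SITEMAP_URL = SITE_URL + '/sitemap.xml'  # starting point for the crawl

# main() could then use these instead of repeating the literal URLs, e.g.:
# all_links = extract_links_from_xml(SITEMAP_URL)
# all_links.append((SITE_URL, 'index.html'))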
It works in my own testing, and I'm already using it in my real setup. Once a crawl finishes, the saved pages and their assets can be found in the output folders.
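Since the script isn't scheduled anywhere yet, one low-effort way to automate it later would be a small standard-library loop that runs the crawl once a day; this is only a sketch, the run time is arbitrary and main() is the function from the listing above:

import datetime
import time

def run_daily(hour=21, minute=0):
    # Sleep until the next occurrence of hour:minute, run the crawl, repeat.
    while True:
        now = datetime.datetime.now()
        target = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
        if target <= now:
            target += datetime.timedelta(days=1)
        time.sleep((target - now).total_seconds())
        main()

A cron job on the crawl machine would do the same thing with less code.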
For syncing I use Resilio Sync (威力同步). Once it is configured, there is no need to upload the files to each server one by one; the sync software handles it by itself, and after each crawl everything is usually synced within a few minutes.
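If you would rather push the files from the crawl machine yourself instead of relying on Resilio Sync, the hand-off can also be scripted. This is only a sketch using rsync over SSH; the host list and remote path are placeholders, not part of my actual setup:

import subprocess

# Placeholder list of frp node servers; replace with your own hosts.
NODES = ['user@node1.example.com', 'user@node2.example.com']

def push_to_nodes(local_dir='html_files', remote_dir='/var/www/mirror/'):
    # Copy the generated static files to every node, deleting stale files remotely.
    for node in NODES:
        subprocess.run(
            ['rsync', '-az', '--delete', f'{local_dir}/', f'{node}:{remote_dir}'],
            check=True,
        )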