- 添加MDF文件导出功能 - 集成阿里云OCR大模型识别 - 添加百度智能云AI照片评分 - 集成DeepSeek大模型创意文案生成 - 完善文档和配置管理 - 使用uv进行现代化依赖管理 - 添加完整的.gitignore配置
99 lines
3.2 KiB
Python
99 lines
3.2 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import re
|
|
|
|
def scrape_webpage(url, selector=None):
    """Fetch a web page and return its text content.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    selector : str, optional
        CSS selector. When given, only the text of matching elements is
        returned (as a list of strings); otherwise the whole page's text
        is returned as a single string.

    Returns
    -------
    list[str] | str
        Element texts when ``selector`` is given, else one string.

    Raises
    ------
    Exception
        Wraps any network or parsing error; the original exception is
        chained as the cause so the full traceback is preserved.
    """
    # Desktop UA string: some sites reject requests without one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        if selector:
            # Extract only the elements matching the CSS selector.
            elements = soup.select(selector)
            return [elem.get_text(strip=True) for elem in elements]

        # No selector: return all visible text.
        # NOTE(review): get_text(strip=True) joins text nodes without a
        # separator, so line/paragraph boundaries are lost — callers that
        # split on newlines will see a single blob. Confirm intended.
        return soup.get_text(strip=True)
    except Exception as e:
        # Chain the cause ("from e") instead of discarding the traceback.
        raise Exception(f"网页抓取失败: {str(e)}") from e
|
|
|
|
def scrape_table_from_webpage(url, table_index=0):
    """Extract one HTML ``<table>`` from a web page.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    table_index : int, optional
        Index of the table to extract (0 = first table on the page).

    Returns
    -------
    tuple[list[str], list[list[str]]] | None
        ``(column_headers, rows)`` where the first ``<tr>`` supplies the
        headers and the remaining rows the data, or ``None`` when the
        page has no tables (or ``table_index`` is out of range).

    Raises
    ------
    Exception
        Wraps any network or parsing error, chaining the original cause.
    """
    # Renamed from the generic `headers` so it cannot be confused with
    # the table's column headers extracted below.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        tables = soup.find_all('table')

        # Consistent "not found" handling: an out-of-range index behaves
        # the same as a page with no tables at all, instead of leaking a
        # wrapped IndexError.
        if not tables or table_index >= len(tables):
            return None

        table = tables[table_index]

        # The first row is assumed to hold the header cells (th or td).
        column_headers = []
        header_row = table.find('tr')
        if header_row:
            column_headers = [th.get_text(strip=True)
                              for th in header_row.find_all(['th', 'td'])]

        # Remaining rows become the data; completely empty rows are skipped.
        data = []
        for row in table.find_all('tr')[1:]:  # skip the header row
            cells = row.find_all(['td', 'th'])
            row_data = [cell.get_text(strip=True) for cell in cells]
            if row_data:
                data.append(row_data)

        return column_headers, data
    except Exception as e:
        # Chain the cause ("from e") instead of discarding the traceback.
        raise Exception(f"网页表格提取失败: {str(e)}") from e
|
|
|
|
def web_to_excel(url, output_path, selector=None):
    """Export web page content to an Excel file.

    Strategy: with a ``selector``, export the matched texts as numbered
    rows; without one, prefer the page's first ``<table>``, falling back
    to the page's plain text split into paragraphs.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    output_path : str
        Destination path for the ``.xlsx`` file.
    selector : str, optional
        CSS selector forwarded to ``scrape_webpage``.

    Returns
    -------
    bool
        ``True`` on success.

    Raises
    ------
    Exception
        Wraps any scraping or export error, chaining the original cause.
    """
    try:
        if selector:
            content = scrape_webpage(url, selector)
            if isinstance(content, list):
                # One row per matched element, with a 1-based index column.
                df = pd.DataFrame({
                    '序号': range(1, len(content) + 1),
                    '内容': content
                })
            else:
                df = pd.DataFrame({'内容': [content]})
        else:
            # Prefer tabular data when the page contains a <table>.
            table_data = scrape_table_from_webpage(url)
            if table_data:
                headers, data = table_data
                df = pd.DataFrame(data, columns=headers)
            else:
                # Fall back to plain text, split into paragraphs.
                content = scrape_webpage(url)
                # NOTE(review): scrape_webpage uses get_text(strip=True),
                # which joins text without newlines, so this split may
                # yield a single paragraph — confirm intended granularity.
                paragraphs = [p.strip() for p in re.split(r'\n+', content)
                              if p.strip()]
                df = pd.DataFrame({
                    '段落编号': range(1, len(paragraphs) + 1),
                    '内容': paragraphs
                })

        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        # Chain the cause ("from e") instead of discarding the traceback.
        raise Exception(f"网页转Excel失败: {str(e)}") from e