GH/utils/web_scraper.py


import re

import requests
import pandas as pd
from bs4 import BeautifulSoup


def scrape_webpage(url, selector=None):
    """Scrape the content of a web page.

    If a CSS selector is given, return a list with the text of each matching
    element; otherwise return the full page text as a single string.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        if selector:
            # Extract only the elements matching the CSS selector
            elements = soup.select(selector)
            content = [elem.get_text(strip=True) for elem in elements]
        else:
            # Extract all text; keep newlines so callers can split into paragraphs
            content = soup.get_text(separator='\n', strip=True)
        return content
    except Exception as e:
        raise Exception(f"Web page scraping failed: {str(e)}") from e


def scrape_table_from_webpage(url, table_index=0):
    """Extract table data from a web page.

    Returns a (column_names, rows) tuple, or None if the page has no tables.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        tables = soup.find_all('table')
        if not tables:
            return None
        table = tables[table_index]
        # Extract the header row
        column_names = []
        header_row = table.find('tr')
        if header_row:
            column_names = [th.get_text(strip=True) for th in header_row.find_all(['th', 'td'])]
        # Extract the data rows, skipping the header row
        data = []
        rows = table.find_all('tr')[1:]
        for row in rows:
            cells = row.find_all(['td', 'th'])
            row_data = [cell.get_text(strip=True) for cell in cells]
            if row_data:
                data.append(row_data)
        return column_names, data
    except Exception as e:
        raise Exception(f"Web table extraction failed: {str(e)}") from e


def web_to_excel(url, output_path, selector=None):
    """Export web page content to an Excel file."""
    try:
        if selector:
            content = scrape_webpage(url, selector)
            if isinstance(content, list):
                df = pd.DataFrame({
                    'No.': range(1, len(content) + 1),
                    'Content': content
                })
            else:
                df = pd.DataFrame({'Content': [content]})
        else:
            # Try to extract a table first
            table_data = scrape_table_from_webpage(url)
            if table_data is not None:
                column_names, data = table_data
                df = pd.DataFrame(data, columns=column_names)
            else:
                # Fall back to plain text, split into paragraphs
                content = scrape_webpage(url)
                paragraphs = [p.strip() for p in re.split(r'\n+', content) if p.strip()]
                df = pd.DataFrame({
                    'Paragraph No.': range(1, len(paragraphs) + 1),
                    'Content': paragraphs
                })
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"Web page to Excel export failed: {str(e)}") from e