import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
# Seconds to wait for the server before giving up.
_REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return the parsed ``BeautifulSoup`` document.

    Raises whatever ``requests`` raises, including ``HTTPError`` for
    4xx/5xx responses (via ``raise_for_status``).
    """
    response = requests.get(
        url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _table_to_frame(headers, rows):
    """Build a DataFrame from a header list and possibly ragged rows.

    ``pd.DataFrame(rows, columns=headers)`` raises ``ValueError`` when a row
    has a different number of cells than the header (colspan, malformed
    markup). Short rows are padded with empty strings and the header is
    extended with generated names so no cell is dropped. A table whose rows
    all match the header is passed through unchanged.
    """
    width = max([len(headers)] + [len(row) for row in rows])
    columns = list(headers) + [
        f'列{i}' for i in range(len(headers) + 1, width + 1)
    ]
    padded = [list(row) + [''] * (width - len(row)) for row in rows]
    return pd.DataFrame(padded, columns=columns)


def scrape_webpage(url, selector=None, separator=''):
    """Fetch a web page and return its text content.

    Args:
        url: Address of the page to download.
        selector: Optional CSS selector. When given, the text of every
            matching element is returned as a list of strings.
        separator: String inserted between adjacent text fragments when
            they are joined. The default ``''`` keeps the historical
            behaviour (fragments concatenated directly).

    Returns:
        A list of strings when ``selector`` is truthy, otherwise a single
        string with the text of the whole page.

    Raises:
        Exception: If the download or parsing fails; the original error is
            attached as ``__cause__``.
    """
    try:
        soup = _fetch_soup(url)
        if selector:
            # Extract only the elements matching the CSS selector.
            return [
                elem.get_text(separator, strip=True)
                for elem in soup.select(selector)
            ]
        # No selector: return all text of the document.
        return soup.get_text(separator, strip=True)
    except Exception as e:
        raise Exception(f"网页抓取失败: {str(e)}") from e


def scrape_table_from_webpage(url, table_index=0):
    """Extract one ``<table>`` from a web page.

    Args:
        url: Address of the page to download.
        table_index: Position of the table among all ``<table>`` elements
            in the page (default: the first one).

    Returns:
        ``None`` when the page contains no table; otherwise a tuple
        ``(headers, data)`` where ``headers`` is the cell text of the first
        row and ``data`` is a list of cell-text lists for the remaining
        non-empty rows.

    Raises:
        Exception: If the download or parsing fails, or ``table_index`` is
            out of range; the original error is attached as ``__cause__``.
    """
    try:
        tables = _fetch_soup(url).find_all('table')
        if not tables:
            return None

        all_rows = tables[table_index].find_all('tr')

        # The first row is treated as the header, whether it uses <th> or <td>.
        column_names = []
        if all_rows:
            column_names = [
                cell.get_text(strip=True)
                for cell in all_rows[0].find_all(['th', 'td'])
            ]

        # Remaining rows are data; rows without any cell are skipped.
        data = []
        for row in all_rows[1:]:
            row_data = [
                cell.get_text(strip=True)
                for cell in row.find_all(['td', 'th'])
            ]
            if row_data:
                data.append(row_data)

        return column_names, data
    except Exception as e:
        raise Exception(f"网页表格提取失败: {str(e)}") from e


def web_to_excel(url, output_path, selector=None):
    """Export the content of a web page to an Excel file.

    With a ``selector``, the text of each matching element becomes one
    numbered row. Without one, the first table of the page is exported; if
    the page has no table, the page text is exported one line per row.

    Args:
        url: Address of the page to download.
        output_path: Destination path passed to ``DataFrame.to_excel``.
        selector: Optional CSS selector restricting what is exported.

    Returns:
        ``True`` when the file has been written.

    Raises:
        Exception: If scraping or writing fails; the original error is
            attached as ``__cause__``.
    """
    try:
        if selector:
            content = scrape_webpage(url, selector)
            if isinstance(content, list):
                df = pd.DataFrame({
                    '序号': range(1, len(content) + 1),
                    '内容': content,
                })
            else:
                df = pd.DataFrame({'内容': [content]})
        else:
            # Prefer a table when the page has one.
            table_data = scrape_table_from_webpage(url)
            if table_data:
                column_names, data = table_data
                df = _table_to_frame(column_names, data)
            else:
                # Fall back to plain text. Text fragments are joined with
                # newlines so the split below has something to split on;
                # with the default empty separator the whole page would
                # collapse into a single row.
                content = scrape_webpage(url, separator='\n')
                paragraphs = [
                    p.strip() for p in re.split(r'\n+', content) if p.strip()
                ]
                df = pd.DataFrame({
                    '段落编号': range(1, len(paragraphs) + 1),
                    '内容': paragraphs,
                })

        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"网页转Excel失败: {str(e)}") from e